iconv(): Reborn

Fix how iconv() extracts the source ("from") and target ("to") encodings from the iconv descriptor
Make iconv() simpler to understand
Make iconv() more systematic and fix some issues along the way
Set errno based on the return values of the converters (defined in *.h files such as utf8.h).
Accept "windows-1251" as an encoding name, handled as CP1251.




git-svn-id: svn://kolibrios.org@7057 a494cfbc-eb01-0410-851d-a64ba20cac60
This commit is contained in:
ashmew2 2017-10-08 11:58:56 +00:00
parent 47b22ed331
commit da7cd23cb8

View File

@ -1,14 +1,11 @@
#include <string.h> #include <string.h>
//#include <stdio.h> #include <stdio.h>
typedef unsigned int size_t; #include <errno.h>
#define NULL ((void*)0)
typedef int conv_t; typedef int conv_t;
typedef unsigned int ucs4_t; typedef unsigned int ucs4_t;
typedef int iconv_t; typedef int iconv_t;
/* Return code if invalid input after a shift sequence of n bytes was read. /* Return code if invalid input after a shift sequence of n bytes was read.
(xxx_mbtowc) */ (xxx_mbtowc) */
#define RET_SHIFT_ILSEQ(n) (-1-2*(n)) #define RET_SHIFT_ILSEQ(n) (-1-2*(n))
@ -22,7 +19,6 @@ typedef int iconv_t;
/* Return code if output buffer is too small. (xxx_wctomb, xxx_reset) */ /* Return code if output buffer is too small. (xxx_wctomb, xxx_reset) */
#define RET_TOOSMALL -2 #define RET_TOOSMALL -2
#define CP866 0 #define CP866 0
#define CP1251 1 #define CP1251 1
#define CP1252 2 #define CP1252 2
@ -37,8 +33,9 @@ typedef int iconv_t;
#include "iso8859_5.h" #include "iso8859_5.h"
#include "utf8.h" #include "utf8.h"
int encoding(char *what) { int encoding(const char *someencoding) {
char *what = strdup(someencoding);
/* Ignore //TRANSLIT or //IGNORE for now. */ /* Ignore //TRANSLIT or //IGNORE for now. */
int i; int i;
for(i = 0; i < strlen(what); i++) { for(i = 0; i < strlen(what); i++) {
@ -50,6 +47,7 @@ int encoding(char *what) {
if (!strcasecmp(what,"CP866")) return CP866; if (!strcasecmp(what,"CP866")) return CP866;
if (!strcasecmp(what,"CP1251")) return CP1251; if (!strcasecmp(what,"CP1251")) return CP1251;
if (!strcasecmp(what,"windows-1251")) return CP1251;
if (!strcasecmp(what,"windows-1252")) return CP1252; if (!strcasecmp(what,"windows-1252")) return CP1252;
if (!strcasecmp(what,"CP1252")) return CP1252; if (!strcasecmp(what,"CP1252")) return CP1252;
if (!strcasecmp(what,"KOI8-RU")) return KOI8_RU; if (!strcasecmp(what,"KOI8-RU")) return KOI8_RU;
@ -58,7 +56,6 @@ int encoding(char *what) {
return -1; return -1;
} }
iconv_t iconv_open(const char *tocode, const char *fromcode) { iconv_t iconv_open(const char *tocode, const char *fromcode) {
int to, from; int to, from;
@ -67,6 +64,7 @@ iconv_t iconv_open(const char *tocode, const char *fromcode) {
to=to<<16&0xFFFF0000; to=to<<16&0xFFFF0000;
from=from&0xFFFF; from=from&0xFFFF;
return to+from; return to+from;
} }
@ -78,16 +76,15 @@ int iconv_close(iconv_t icd)
size_t iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft, size_t iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft,
char **outbuf, size_t *outbytesleft) char **outbuf, size_t *outbytesleft)
{ {
int n, to, from, count1,count2; int n, to, from;
int pwc, converted,written; size_t count1,count2;
unsigned int pwc;
int converted,written;
int (*mbtowc)(conv_t, ucs4_t *, const unsigned char *, int); int (*mbtowc)(conv_t, ucs4_t *, const unsigned char *, int);
int (*wctomb)(conv_t, ucs4_t *, const unsigned char *, int); int (*wctomb)(conv_t, unsigned char *, ucs4_t, int);
char *str; to=cd>>16;
str=*outbuf; from=cd&0xFFFF;
from=cd>>16;
to=cd&0xFFFF;
switch (from) switch (from)
{ {
@ -97,7 +94,7 @@ size_t iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft,
case ISO8859_5: mbtowc=iso8859_5_mbtowc; break; case ISO8859_5: mbtowc=iso8859_5_mbtowc; break;
case KOI8_RU: mbtowc=koi8_ru_mbtowc; break; case KOI8_RU: mbtowc=koi8_ru_mbtowc; break;
case UTF_8: mbtowc=utf8_mbtowc; break; case UTF_8: mbtowc=utf8_mbtowc; break;
default: return -2; default: return (size_t)-1;
} }
switch (to) switch (to)
@ -108,126 +105,155 @@ size_t iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft,
case ISO8859_5: wctomb=iso8859_5_wctomb; break; case ISO8859_5: wctomb=iso8859_5_wctomb; break;
case KOI8_RU: wctomb=koi8_ru_wctomb; break; case KOI8_RU: wctomb=koi8_ru_wctomb; break;
case UTF_8: wctomb=utf8_wctomb; break; case UTF_8: wctomb=utf8_wctomb; break;
default: return -3; default: return (size_t)-1;
}
if(from == to) {
int oc=0,ic=0;
while(*inbytesleft > 0 && *outbytesleft > 0) {
str[oc]=(*inbuf)[ic];
++ic;
++oc;
(*inbytesleft)--;
(*outbytesleft)--;
(*outbuf)++;
}
return 0;
} }
count1=0; count1=0;
count2=0; count2=0;
while ( *inbytesleft>0 && *outbytesleft>1) /* Convert input multibyte char to wide character by using calls to mbtowc */
{ /* Convert wide character to multibyte by calls to wctomb */
n=1; /* Handle errors as we go on converting to be as standard compliant as possible */
while(count1 < *inbytesleft) {
unsigned char mbholder[] = { 0,0,0,0,0,0 };
do { int numbytes = (mbtowc)(0, &pwc,((*inbuf)+count1), *inbytesleft - count1);
//converted= (utf8_mbtowc)(0,&pwc,((*inbuf)+count1),n); if(numbytes < 0) {
// printf("%d\n",n); /* errno = EILSEQ if invalid multibyte sequence encountered in input */
converted= (mbtowc)(0,&pwc,((*inbuf)+count1),n); /* errno = EINVAL if input ends in the middle of a multibyte sequence */
n++; switch(numbytes) {
} while (converted==RET_TOOFEW(0)); case RET_TOOFEW(0):
errno = EINVAL;
break;
if (converted<0) return -10; case RET_ILSEQ:
//written= (cp866_wctomb)(0,str+count2,pwc,1); errno = EILSEQ;
written= (wctomb)(0,str+count2,pwc,1); break;
if (written<0) written=0;//return -11;
//printf("Conv:%d Wri:%d In:%d Out:%d UTF:%x UCS:%x 866:%s\n",converted, written, *inbytesleft,*outbytesleft,*((*inbuf)+count1),pwc, str);
(*inbytesleft)-=converted;
(*outbytesleft)-=written;
(*outbuf)+=written;
count1+=converted;
count2+=written;
} }
*(str+count2)='\0';
if (*inbytesleft>0 && *outbytesleft==0) return -12; *inbytesleft -= count1;
return 0; *outbytesleft -= count2;
*inbuf += count1;
*outbuf += count2;
return (size_t) -1;
}
/* Convert from wide to multibyte storing result in mbholder and num converted in numbytes2 */
/* Pass the minimum amount of space we have, one from mbholder and one from remaining in outbuf */
int minspace = sizeof(mbholder) <= (*outbytesleft - count2) ? sizeof(mbholder) : (*outbytesleft - count2);
int numbytes2 = (wctomb)(0, &mbholder[0], pwc, minspace);
if(numbytes2 < 0) {
switch(numbytes2) {
case RET_ILUNI:
errno = EILSEQ;
break;
case RET_TOOSMALL:
errno = E2BIG;
break;
}
*inbytesleft -= count1;
*outbytesleft -= count2;
*inbuf += count1;
*outbuf += count2;
return (size_t) -1;
}
int i;
for(i = 0; i < numbytes2; i++) {
*(*outbuf + count2 + i) = mbholder[i];
}
count1+=numbytes;
count2+=numbytes2;
}
/* Successfully converted everything, update the variables and return number of bytes converted */
*inbytesleft -= count1;
*outbytesleft -= count2;
*inbuf += count1;
*outbuf += count2;
return count1;
} }
/* int main() */
/* { */
/* char *s;// ="вертолет"; */
/* char *z; */
/* //unsigned int pwc; */
/* iconv_t cd; */
/* size_t in, out; */
/* /* FILE *infile; */
int main() /* char *fname = "file3.txt"; */
{
char *s;// ="вертолет";
char *z;
//unsigned int pwc;
iconv_t cd;
int in, out;
FILE *infile; /* size_t testmax = 100; */
char *fname = "file.txt"; /* size_t test = 0; */
infile = fopen(fname,"r"); /* infile = fopen(fname,"r"); */
fseek(infile, 0, SEEK_END); /* fseek(infile, 0, SEEK_END); */
size_t file_size = ftell(infile); /* size_t file_size = ftell(infile); */
rewind(infile); /* rewind(infile); */
//printf ("LOL\n"); /* char *buffer = (char*)malloc(file_size * sizeof(char)); */
/* if (buffer == NULL) */
/* { */
/* fclose(infile); */
/* printf("Error allocating %d bytes.\n", file_size * sizeof(char)); */
/* return -1; */
/* } */
/* size_t bytes_read = fread(buffer, sizeof(char), file_size, infile); */
/* if (bytes_read != file_size) */
/* { */
/* /\* printf("Have read only %d bytes of %d.\n", bytes_read, file_size); *\/ */
/* free(buffer); */
/* fclose(infile); */
/* return -1; */
/* } */
char *buffer = (char*)malloc(file_size * sizeof(char)); /* /\* in=strlen(buffer); *\/ */
if (buffer == NULL) /* in = bytes_read; */
{ /* z=malloc(in+12000); */
fclose(infile);
printf("Error allocating %d bytes.\n", file_size * sizeof(char));
return -1;
}
size_t bytes_read = fread(buffer, sizeof(char), file_size, infile);
if (bytes_read != file_size)
{
printf("Have read only %d bytes of %d.\n", bytes_read, file_size);
free(buffer);
fclose(infile);
return -1;
}
in=strlen(buffer); /* out=in-1000; */
z=malloc(in+1); /* cd=iconv_open("UTF-8","UTF-8"); */
/* // printf("%x\n",cd); */
/* int t; */
/* char *zor = z; */
out=in+1; /* /\* for(t = 0; t < 27400; t++) *\/ */
cd=iconv_open("CP1251","CP866"); /* /\* printf("0x%x,", buffer[t]); *\/ */
// printf("%x\n",cd);
int t;
t=iconv(cd, &buffer, &in, &z, &out);
printf("\nResult: %d", t);
puts(z);
//for (;s<s+strlen(s);s++) {cp866_mbtowc (0, &pwc, s, 1);printf("%c=%u\n",*s,pwc);}
}
*/
/* t=iconv(cd, &buffer, &in, &z, &out); */
/* /\* printf("\nResult after iconv(): %d", t); *\/ */
typedef struct /* /\* for(t = 0; t < 24259; t++) *\/ */
{ /* /\* printf("%c", zor[t]); *\/ */
char *name;
void *f;
} export_t;
char szStart[] = "START"; /* //for (;s<s+strlen(s);s++) {cp866_mbtowc (0, &pwc, s, 1);printf("%c=%u\n",*s,pwc);} */
char szVersion[] = "version"; /* } */
char sziconv_open[] = "iconv_open";
char sziconv[] = "iconv";
export_t EXPORTS[] __asm__("EXPORTS") = /* typedef struct */
{ /* { */
{ szStart, (void*)0x0 }, /* char *name; */
{ szVersion, (void*)0x00010001 }, /* void *f; */
{ sziconv_open, iconv_open }, /* } export_t; */
{ sziconv, iconv },
{ NULL, NULL }, /* char szStart[] = "START"; */
}; /* char szVersion[] = "version"; */
/* char sziconv_open[] = "iconv_open"; */
/* char sziconv[] = "iconv"; */
/* export_t EXPORTS[] __asm__("EXPORTS") = */
/* { */
/* { szStart, (void*)0x0 }, */
/* { szVersion, (void*)0x00010001 }, */
/* { sziconv_open, iconv_open }, */
/* { sziconv, iconv }, */
/* { NULL, NULL }, */
/* }; */