From da7cd23cb843b4a0a63493ee80a913a8a42cefe9 Mon Sep 17 00:00:00 2001 From: ashmew2 Date: Sun, 8 Oct 2017 11:58:56 +0000 Subject: [PATCH] iconv(): Reborn Fix issues when getting from and to encodings from iconv descriptor in iconv() Make iconv() simpler to understand Make iconv() more systematic and fix some issues along the way Implement errno setting based on return values from converters (*.h files like utf8.h). Add Windows-1251 handling as CP1251. git-svn-id: svn://kolibrios.org@7057 a494cfbc-eb01-0410-851d-a64ba20cac60 --- programs/develop/libraries/iconv/iconv.c | 286 ++++++++++++----------- 1 file changed, 156 insertions(+), 130 deletions(-) diff --git a/programs/develop/libraries/iconv/iconv.c b/programs/develop/libraries/iconv/iconv.c index 9a855cc06d..b502fc9936 100644 --- a/programs/develop/libraries/iconv/iconv.c +++ b/programs/develop/libraries/iconv/iconv.c @@ -1,14 +1,11 @@ #include -//#include -typedef unsigned int size_t; -#define NULL ((void*)0) +#include +#include typedef int conv_t; typedef unsigned int ucs4_t; - typedef int iconv_t; - /* Return code if invalid input after a shift sequence of n bytes was read. (xxx_mbtowc) */ #define RET_SHIFT_ILSEQ(n) (-1-2*(n)) @@ -22,7 +19,6 @@ typedef int iconv_t; /* Return code if output buffer is too small. (xxx_wctomb, xxx_reset) */ #define RET_TOOSMALL -2 - #define CP866 0 #define CP1251 1 #define CP1252 2 @@ -37,8 +33,9 @@ typedef int iconv_t; #include "iso8859_5.h" #include "utf8.h" -int encoding(char *what) { +int encoding(const char *someencoding) { + char *what = strdup(someencoding); /* Ignore //TRANSLIT or //IGNORE for now. */ int i; for(i = 0; i < strlen(what); i++) { @@ -50,6 +47,7 @@ int encoding(char *what) { if (!strcasecmp(what,"CP866")) return CP866; if (!strcasecmp(what,"CP1251")) return CP1251; + if (!strcasecmp(what,"windows-1251")) return CP1251; if (!strcasecmp(what,"windows-1252")) return CP1252; if (!strcasecmp(what,"CP1252")) return CP1252; if (!strcasecmp(what,"KOI8-RU")) return KOI8_RU; @@ -58,7 +56,6 @@ int encoding(char *what) { return -1; } - iconv_t iconv_open(const char *tocode, const char *fromcode) { int to, from; @@ -67,6 +64,7 @@ iconv_t iconv_open(const char *tocode, const char *fromcode) { to=to<<16&0xFFFF0000; from=from&0xFFFF; + return to+from; } @@ -78,17 +76,16 @@ int iconv_close(iconv_t icd) size_t iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft) { - int n, to, from, count1,count2; - int pwc, converted,written; + int n, to, from; + size_t count1,count2; + unsigned int pwc; + int converted,written; int (*mbtowc)(conv_t, ucs4_t *, const unsigned char *, int); - int (*wctomb)(conv_t, ucs4_t *, const unsigned char *, int); - - char *str; - str=*outbuf; - - from=cd>>16; - to=cd&0xFFFF; - + int (*wctomb)(conv_t, unsigned char *, ucs4_t, int); + + to=cd>>16; + from=cd&0xFFFF; + switch (from) { case CP866: mbtowc=cp866_mbtowc; break; @@ -97,9 +94,9 @@ size_t iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft, case ISO8859_5: mbtowc=iso8859_5_mbtowc; break; case KOI8_RU: mbtowc=koi8_ru_mbtowc; break; case UTF_8: mbtowc=utf8_mbtowc; break; - default: return -2; + default: return (size_t)-1; } - + switch (to) { case CP866: wctomb=cp866_wctomb; break; @@ -108,126 +105,155 @@ size_t iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft, case ISO8859_5: wctomb=iso8859_5_wctomb; break; case KOI8_RU: wctomb=koi8_ru_wctomb; break; case UTF_8: wctomb=utf8_wctomb; break; - default: return -3; + default: return (size_t)-1; } - if(from == to) { - int oc=0,ic=0; - - while(*inbytesleft > 0 && *outbytesleft > 0) { - str[oc]=(*inbuf)[ic]; - ++ic; - ++oc; - (*inbytesleft)--; - (*outbytesleft)--; - (*outbuf)++; - } - - return 0; - } - count1=0; count2=0; - - while ( *inbytesleft>0 && *outbytesleft>1) - { - n=1; - - do { - //converted= (utf8_mbtowc)(0,&pwc,((*inbuf)+count1),n); - // printf("%d\n",n); - converted= (mbtowc)(0,&pwc,((*inbuf)+count1),n); - - n++; - } while (converted==RET_TOOFEW(0)); - - if (converted<0) return -10; - //written= (cp866_wctomb)(0,str+count2,pwc,1); - written= (wctomb)(0,str+count2,pwc,1); - if (written<0) written=0;//return -11; - - //printf("Conv:%d Wri:%d In:%d Out:%d UTF:%x UCS:%x 866:%s\n",converted, written, *inbytesleft,*outbytesleft,*((*inbuf)+count1),pwc, str); - - (*inbytesleft)-=converted; - (*outbytesleft)-=written; - (*outbuf)+=written; - count1+=converted; - count2+=written; - } - *(str+count2)='\0'; - - if (*inbytesleft>0 && *outbytesleft==0) return -12; - return 0; + + /* Convert input multibyte char to wide character by using calls to mbtowc */ + /* Convert wide character to multibyte by calls to wctomb */ + /* Handle errors as we go on converting to be as standard compliant as possible */ + while(count1 < *inbytesleft) { + unsigned char mbholder[] = { 0,0,0,0,0,0 }; + + int numbytes = (mbtowc)(0, &pwc,((*inbuf)+count1), *inbytesleft - count1); + if(numbytes < 0) { + /* errno = EILSEQ if invalid multibyte sequence encountered in input */ + /* errno = EINVAL if input ends in the middle of a multibyte sequence */ + + switch(numbytes) { + case RET_TOOFEW(0): + errno = EINVAL; + break; + + case RET_ILSEQ: + errno = EILSEQ; + break; + } + + *inbytesleft -= count1; + *outbytesleft -= count2; + *inbuf += count1; + *outbuf += count2; + return (size_t) -1; + } + + /* Convert from wide to multibyte storing result in mbholder and num converted in numbytes2 */ + /* Pass the minimum amount of space we have, one from mbholder and one from remaining in outbuf */ + int minspace = sizeof(mbholder) <= (*outbytesleft - count2) ? sizeof(mbholder) : (*outbytesleft - count2); + + int numbytes2 = (wctomb)(0, &mbholder[0], pwc, minspace); + if(numbytes2 < 0) { + switch(numbytes2) { + case RET_ILUNI: + errno = EILSEQ; + break; + case RET_TOOSMALL: + errno = E2BIG; + break; + } + + *inbytesleft -= count1; + *outbytesleft -= count2; + *inbuf += count1; + *outbuf += count2; + + return (size_t) -1; + } + + int i; + for(i = 0; i < numbytes2; i++) { + *(*outbuf + count2 + i) = mbholder[i]; + } + + count1+=numbytes; + count2+=numbytes2; + } + + /* Successfully converted everything, update the variables and return number of bytes converted */ + *inbytesleft -= count1; + *outbytesleft -= count2; + *inbuf += count1; + *outbuf += count2; + + return count1; } +/* int main() */ +/* { */ +/* char *s;// ="вертолет"; */ +/* char *z; */ +/* //unsigned int pwc; */ +/* iconv_t cd; */ +/* size_t in, out; */ -/* -int main() -{ - char *s;// ="вертолет"; - char *z; - //unsigned int pwc; - iconv_t cd; - int in, out; - - FILE *infile; - char *fname = "file.txt"; - - infile = fopen(fname,"r"); - - fseek(infile, 0, SEEK_END); - size_t file_size = ftell(infile); - rewind(infile); +/* FILE *infile; */ +/* char *fname = "file3.txt"; */ - //printf ("LOL\n"); +/* size_t testmax = 100; */ +/* size_t test = 0; */ - char *buffer = (char*)malloc(file_size * sizeof(char)); - if (buffer == NULL) - { - fclose(infile); - printf("Error allocating %d bytes.\n", file_size * sizeof(char)); - return -1; - } - size_t bytes_read = fread(buffer, sizeof(char), file_size, infile); - if (bytes_read != file_size) - { - printf("Have read only %d bytes of %d.\n", bytes_read, file_size); - free(buffer); - fclose(infile); - return -1; - } - - in=strlen(buffer); - z=malloc(in+1); - - out=in+1; - cd=iconv_open("CP1251","CP866"); -// printf("%x\n",cd); - int t; - t=iconv(cd, &buffer, &in, &z, &out); - printf("\nResult: %d", t); - puts(z); - //for (;s