diff -ru -x po gnumeric-0.63-orig/configure.in gnumeric-0.63/configure.in --- gnumeric-0.63-orig/configure.in Sun Mar 11 17:15:59 2001 +++ gnumeric-0.63/configure.in Tue Mar 13 16:48:41 2001 @@ -85,6 +85,10 @@ dnl check for complete locale implementation AC_CHECK_HEADERS(langinfo.h) +dnl we need iconv for translating to/from locale's charset when +dnl reading/writing Excel files +AC_CHECK_HEADERS(iconv.h) + dnl ************************************************** dnl * ORBit support dnl ************************************************** diff -ru -x po gnumeric-0.63-orig/plugins/excel/ms-excel-read.c gnumeric-0.63/plugins/excel/ms-excel-read.c --- gnumeric-0.63-orig/plugins/excel/ms-excel-read.c Sun Mar 11 17:17:05 2001 +++ gnumeric-0.63/plugins/excel/ms-excel-read.c Mon Mar 12 10:23:16 2001 @@ -6,9 +6,11 @@ * Jody Goldberg (jgoldberg@home.com) * * (C) 1998, 1999, 2000 Michael Meeks, Jody Goldberg + * unicode and national language support (C) 2001 by Vlad Harchev **/ #include +#include #include "ms-formula-read.h" #include "ms-excel-read.h" @@ -55,6 +57,8 @@ extern int ms_excel_object_debug; extern int gnumeric_debugging; +static excel_iconv_t current_workbook_iconv = NULL; + /* Forward references */ static ExcelSheet *ms_excel_sheet_new (ExcelWorkbook *wb, const char *name); @@ -169,6 +173,53 @@ } } +static char * +get_chars (const char *ptr, guint length, gboolean high_byte) /* Decode length characters of Excel text at ptr into a newly allocated, NUL-terminated string: 16-bit characters go through wcstombs(), 8-bit ones through the current workbook's iconv handle. */ +{ + char* ans; + guint32 lp; + + if (high_byte) { + wchar_t* wc = g_new (wchar_t, length + 2); + int retlength; + ans = g_new (char, (length+2)*8); + + for (lp = 0; lp < length; lp++) { + guint16 c = MS_OLE_GET_GUINT16 (ptr); + ptr+=2; + wc[lp] = c; + } + + wc[length] = 0; retlength = wcstombs(ans, wc, (length+2)*8 - 1); /* the limit is the size of ans in bytes (minus the terminator), not the character count; wc must be terminated for that */ + g_free(wc); + if (retlength == (size_t)-1) + retlength = 0; + + ans[retlength] = 0; + ans = g_realloc(ans, retlength + 2); + } else { + size_t inbytes = length, + outbytes = (length+2)*8, + retlength; + char* inbuf = g_new(char, length), *outbufptr; + char* inbufptr = inbuf; + + ans = g_new (char, 
outbytes + 1); + outbufptr = ans; + for (lp = 0; lp < length; lp++) { + inbuf[lp] = MS_OLE_GET_GUINT8 (ptr); + ptr+=1; + }; + excel_iconv(current_workbook_iconv,&inbufptr,&inbytes,&outbufptr,&outbytes); + + retlength = outbufptr-ans; + ans[retlength] = 0; + ans = g_realloc(ans,retlength+1); + g_free(inbuf); + }; + return ans; +} + /** * This function takes a length argument as Biff V7 has a byte length * ( seemingly ). @@ -185,7 +236,6 @@ guint32 byte_len; gboolean header; gboolean high_byte; - static gboolean high_byte_warned = FALSE; gboolean ext_str; gboolean rich_str; @@ -207,8 +257,6 @@ } #endif - ans = (char *) g_new (char, length + 2); - header = biff_string_get_flags (pos, &high_byte, &ext_str, @@ -219,12 +267,6 @@ } else ptr = pos; - /* A few friendly warnings */ - if (high_byte && !high_byte_warned) { - printf ("FIXME: unicode support unimplemented: truncating\n"); - high_byte_warned = TRUE; - } - { guint32 pre_len, end_len; @@ -242,51 +284,16 @@ } #endif - for (lp = 0; lp < length; lp++) { - guint16 c; - - if (high_byte) { - c = MS_OLE_GET_GUINT16 (ptr); - ptr+=2; - ans[lp] = (char)c; - (*byte_length) += 2; - } else { - c = MS_OLE_GET_GUINT8 (ptr); - ptr+=1; - ans[lp] = (char)c; - (*byte_length) += 1; - } - } - if (lp > 0) - ans[lp] = 0; - else + if (!length) { + ans = g_new0 (char, 2); /* zero-filled so the empty result is a valid NUL-terminated string */ g_warning ("Warning unterminated string floating"); + } else { + (*byte_length) += (high_byte ? 
2 : 1)*length; + ans = get_chars(ptr, length, high_byte); + }; return ans; } -static char * -get_utf8_chars (const char *ptr, guint len, gboolean high_byte) -{ - int i; - char *ans = g_new (char, len + 1); - - for (i = 0; i < len; i++) { - guint16 c; - - if (high_byte) { - c = MS_OLE_GET_GUINT16 (ptr); - ptr+=2; - ans [i] = (char)c; - } else { - c = MS_OLE_GET_GUINT8 (ptr); - ptr+=1; - ans [i] = (char)c; - } - } - ans [i] = '\0'; - - return ans; -} static guint32 sst_bound_check (BiffQuery *q, guint32 offset) @@ -366,7 +373,7 @@ g_assert (get_len >= 0); /* FIXME: split this simple bit out of here, it makes more sense damnit */ - str = get_utf8_chars (q->data + new_offset + pre_len, get_len, high_byte); + str = get_chars (q->data + new_offset + pre_len, get_len, high_byte); new_offset += pre_len + get_len * (high_byte?2:1); if (!(*output)) @@ -587,12 +594,23 @@ ans->hidden = MS_BIFF_H_VISIBLE; break; } +#if 0 if (ver == MS_BIFF_V8) { - int slen = MS_OLE_GET_GUINT16 (q->data + 6); + int slen = MS_OLE_GET_GUINT16 (q->data + 6); ans->name = biff_get_text (q->data + 8, slen, NULL); - } else { + } else +#endif + { + /* + * There are test files produced by non-latin1 Excel (e.g. the + * Russian version) that prove that the branch above is + * incorrect. It seems the test files that convinced the author of + * that branch were produced by a latin1 version of Excel - + * in that case q->data[7] is always 0, so it can be attributed + * either to the length of the sheet name or to the string header. + * - Vlad Harchev + */ int slen = MS_OLE_GET_GUINT8 (q->data + 6); - ans->name = biff_get_text (q->data + 7, slen, NULL); } @@ -4172,6 +4190,8 @@ /* MW: And on Excel seems to drive the display of currency amounts. 
*/ const guint16 codepage = MS_OLE_GET_GUINT16 (q->data); + excel_iconv_close(current_workbook_iconv); + current_workbook_iconv = excel_iconv_open_for_import(codepage); #ifndef NO_DEBUG_EXCEL if (ms_excel_read_debug > 0) { switch(codepage) { @@ -4319,7 +4339,7 @@ fflush (stdout); } #endif - + excel_iconv_close(current_workbook_iconv); current_workbook_iconv = NULL; /* do not leave a closed handle behind for the next import */ if (wb) { /* Cleanup */ ms_excel_workbook_destroy (wb); diff -ru -x po gnumeric-0.63-orig/plugins/excel/ms-excel-util.c gnumeric-0.63/plugins/excel/ms-excel-util.c --- gnumeric-0.63-orig/plugins/excel/ms-excel-util.c Tue Oct 31 20:21:05 2000 +++ gnumeric-0.63/plugins/excel/ms-excel-util.c Tue Mar 13 16:46:47 2001 @@ -5,6 +5,7 @@ * Jon K Hellan (hellan@acm.org) * * (C) 1999, 2000 Jon K Hellan + * excel_iconv* family of functions (C) 2001 by Vlad Harchev **/ #include "config.h" @@ -14,7 +15,16 @@ #include "ms-excel-util.h" #include +#include +#ifdef HAVE_LANGINFO_H +#include +#endif + +#ifdef HAVE_ICONV_H +#define HAVE_ICONV +#include +#endif extern int ms_excel_read_debug; /* @@ -318,4 +328,161 @@ /* Use a rough heuristic for unknown fonts. 
*/ return .5625 * size_pts; +} + + + +static char* +get_locale_charset_name() +{ +#ifndef HAVE_ICONV + return ""; +#else + static char* charset = NULL; + + if (charset) + return charset; + +#ifdef _NL_CTYPE_CODESET_NAME + charset = nl_langinfo (_NL_CTYPE_CODESET_NAME); +#elif defined(CODESET) + charset = nl_langinfo (CODESET); +#else /* neither langinfo macro is available: take the part of the locale name after the '.' */ + { + char* locale = setlocale(LC_CTYPE,NULL); + char* tmp = strchr(locale,'.'); + if (tmp) + charset = tmp+1; + } +#endif + if (!charset) + charset = "ISO-8859-1"; + charset = g_strdup(charset); + return charset; +#endif +} + +typedef struct +{ + const char** keys;/*NULL-terminated list*/ + int value; +} s_hash_entry; + +/* here is a list of languages for which cp1251 is used on Windows*/ +static const char* cyr_locales[] = +{ + "russian", "ru", "be", "uk", "ukrainian", NULL +}; + +static const s_hash_entry win_codepages[]= +{ + { cyr_locales , 1251 }, + { NULL } /*terminator*/ +}; + +guint +excel_iconv_win_codepage() +{ + char* lang = NULL; + static guint codepage = 0; + char* env_lang; + + if (codepage) + return codepage; + + /* the code below is executed only once */ + if (env_lang = getenv("WINDOWS_LANGUAGE")) + lang = env_lang; /* just for flexibility */ + else { + char* locale = setlocale(LC_CTYPE,NULL); + char* lang_sep = strchr(locale,'_'); + if (lang_sep) + lang = g_strndup(locale,lang_sep-locale); + else + lang = locale; + } + lang = g_strdup(lang); + + { + const s_hash_entry* entry; + for(entry = win_codepages; entry->keys; ++entry) { + const char** key; + for(key=entry->keys; *key; ++key) { + if (!g_strcasecmp(*key,lang)) { + codepage = entry->value; + return codepage; + }; + }; + } + } + codepage = 1252; /*default one*/ + return codepage; +} + +/*these two will figure out which charset names to use*/ +excel_iconv_t +excel_iconv_open_for_import(guint codepage) +{ +#ifndef HAVE_ICONV + return (excel_iconv_t)(-1); +#else + char* src_charset; + iconv_t iconv_handle; + + src_charset = g_strdup_printf("CP%d",codepage); + 
iconv_handle = iconv_open(get_locale_charset_name(), src_charset); + g_free(src_charset); + return iconv_handle; +#endif +} + +excel_iconv_t +excel_iconv_open_for_export() +{ +#ifndef HAVE_ICONV + return (excel_iconv_t)(-1); +#else + static char* dest_charset = NULL; + iconv_t iconv_handle; + + if (!dest_charset) + dest_charset = g_strdup_printf("CP%d",excel_iconv_win_codepage()); + iconv_handle = iconv_open(dest_charset, get_locale_charset_name()); + return iconv_handle; +#endif +}; + +void +excel_iconv_close(excel_iconv_t handle) +{ +#ifdef HAVE_ICONV + if (handle && handle != (excel_iconv_t)(-1)) + iconv_close(handle); +#endif +} + +size_t +excel_iconv(excel_iconv_t handle,char ** const inbuf, size_t *inbytesleft, + char **outbuf, size_t *outbytesleft) +{ +#ifndef HAVE_ICONV + guint tocopy = *inbytesleft <= *outbytesleft ? *inbytesleft : *outbytesleft; + memcpy(*outbuf,*inbuf,tocopy); + *outbuf += tocopy; + *inbuf += tocopy; + *outbytesleft -= tocopy; + *inbytesleft -= tocopy; +#else + while (*inbytesleft){ + if (handle && handle!=(iconv_t)(-1)) + iconv((iconv_t)handle, inbuf, inbytesleft, + outbuf, outbytesleft); + if (!*inbytesleft || !*outbytesleft) + return 0; + /*got invalid seq - so replace it with original character*/ + **outbuf = **inbuf; (*outbuf)++; (*outbytesleft)--; + (*inbuf)++; (*inbytesleft)--; + }; +#endif + return 0; } diff -ru -x po gnumeric-0.63-orig/plugins/excel/ms-excel-util.h gnumeric-0.63/plugins/excel/ms-excel-util.h --- gnumeric-0.63-orig/plugins/excel/ms-excel-util.h Fri Mar 24 19:36:02 2000 +++ gnumeric-0.63/plugins/excel/ms-excel-util.h Mon Mar 12 02:10:47 2001 @@ -10,6 +10,7 @@ #define GNUMERIC_MS_EXCEL_UTIL_H #include +#include #include "sheet.h" typedef struct _TwoWayTable TwoWayTable; @@ -51,5 +52,38 @@ double lookup_font_base_char_width_new (char const * const name, double size_pts, gboolean const is_default); + + +/* a group of iconv_* - like functions, with safe fallbacks if iconv is + unavailable. 
Sorry for stupid prefix - Vlad Harchev */ +typedef void* excel_iconv_t;/* NULL and (-1) both mean "no converter available" */ + +/* + this returns code of the codepage that should be used when exporting + .xls files (it's guessed by looking at language name). Fallback is 1252. +*/ +guint +excel_iconv_win_codepage(); + +/*these two will figure out which charset names to use*/ +excel_iconv_t +excel_iconv_open_for_import(guint codepage); + +excel_iconv_t +excel_iconv_open_for_export(); + +void +excel_iconv_close(excel_iconv_t handle); +/*if fails (or if compiled without support for iconv), it will + copy the input string to output and pretend that all worked fine. + If some char is non-convertible, the original byte is copied to the output unchanged. + + It's required that inbytesleft <= outbytesleft (so that fallback will be + able to work). As for now, return value is not meaningful at all - 0 is + always returned. +*/ +size_t +excel_iconv(excel_iconv_t handle,char ** const inbuf, size_t *inbytesleft, + char **outbuf, size_t *outbytesleft); #endif diff -ru -x po gnumeric-0.63-orig/plugins/excel/ms-excel-write.c gnumeric-0.63/plugins/excel/ms-excel-write.c --- gnumeric-0.63-orig/plugins/excel/ms-excel-write.c Sun Mar 11 17:17:05 2001 +++ gnumeric-0.63/plugins/excel/ms-excel-write.c Mon Mar 12 10:18:59 2001 @@ -57,6 +57,7 @@ #include "ms-excel-xf.h" #include "ms-formula-write.h" +static excel_iconv_t current_workbook_iconv = NULL; /** * This function writes simple strings... 
* FIXME: see S59D47.HTM for full description @@ -112,13 +113,31 @@ } ms_biff_put_var_write (bp, data, off); -/* You got it coming */ - for (lp = 0; lp < len; lp++) { - MS_OLE_SET_GUINT16 (data, txt[lp]); - ms_biff_put_var_write (bp, data, unicode?2:1); - } - return off + len*(unicode?2:1); - + if (unicode) { + wchar_t* wcbuf = g_new(wchar_t,len); + len = mbstowcs(wcbuf,txt,len); + for (lp = 0; lp < len; lp++) { + MS_OLE_SET_GUINT16 (data, wcbuf[lp]); + ms_biff_put_var_write (bp, data, 2); + } + g_free(wcbuf); + lp *= 2; + } else { + size_t inbufleft = len, outbufleft = len*8; + char* mbbuf = g_new(char, outbufleft); + char* inbufptr = txt, *outbufptr = mbbuf; + int retlen; + + excel_iconv(current_workbook_iconv, &inbufptr, &inbufleft, + &outbufptr, &outbufleft); + retlen = outbufptr - mbbuf; + for (lp = 0; lp < retlen; lp++) { + MS_OLE_SET_GUINT8 (data, mbbuf[lp]); + ms_biff_put_var_write (bp, data, 1); + } + g_free(mbbuf); + }; + return off + lp; /* An attempt at efficiency */ /* chunks = len/BLK_LEN; pos = 0; @@ -367,7 +386,7 @@ /* See: S59D66.HTM */ data = ms_biff_put_len_next (bp, BIFF_CODEPAGE, 2); - MS_OLE_SET_GUINT16 (data, 0x04e4); /* ANSI */ + MS_OLE_SET_GUINT16 (data, excel_iconv_win_codepage()); ms_biff_put_commit (bp); if (ver >= MS_BIFF_V8) { /* See S59D78.HTM */ @@ -2974,7 +2993,7 @@ MS_OLE_SET_GUINT32 (data + 20, 0x3fe00000); MS_OLE_SET_GUINT32 (data + 24, 0x00000000); MS_OLE_SET_GUINT32 (data + 28, 0x3fe00000); - MS_OLE_SET_GUINT16 (data + 32, 0x04e4); + MS_OLE_SET_GUINT16 (data + 32, excel_iconv_win_codepage()); ms_biff_put_commit (bp); write_externsheets (bp, sheet->wb, sheet); @@ -3436,6 +3455,7 @@ ExcelSheet *s = 0; int lp; + current_workbook_iconv = excel_iconv_open_for_export(); /* Workbook */ wb->streamPos = biff_bof_write (bp, ver, MS_BIFF_TYPE_Workbook); @@ -3473,6 +3493,8 @@ s->streamPos); } /* End Finalised workbook */ + excel_iconv_close (current_workbook_iconv); + current_workbook_iconv = NULL; } /*