/* Conversion of files between different charsets and surfaces.
   Copyright © 1990, 93, 97, 98, 99 Free Software Foundation, Inc.
   Contributed by François Pinard, 1988.

   The `recode' Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public License
   as published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   The `recode' Library is distributed in the hope that it will be
   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with the `recode' Library; see the file `COPYING.LIB'.
   If not, write to the Free Software Foundation, Inc., 59 Temple Place -
   Suite 330, Boston, MA 02111-1307, USA.  */

#include "common.h"

/********************************************************************
 * This is an implementation of UCS-2 -> RTF translation,
 * derived from the html.c file in recode. Strictly speaking,
 * RTF is both a charset (in some cases, as for non-breaking
 * spaces) and a surface mapped on either:
 *
 * 1. ansi = CP1252 1252 ms-ansi windows-1252 (default)
 * 2. mac = macintosh charset
 * 3. pc = IBM Codepage 437
 * 4. pca = IBM Codepage 850
 * 5. Character table swap directives -> certain charset
 * 6. Unicode
 *
 * When an RTF writer encodes a file, first the basic charset
 * should be used. As a second alternative, it should swap
 * charset with a sequence involving \fNN commands. As a last resort,
 * unicode can be used, but with a supplementing "simple"
 * representation using the common method, for backward compatibility.
 *
 * All RTF files are -- eh, should be -- strictly 7-bit.
 *
 * RTF readers / writers are supposed to detect which encoding is
 * used and translate it automatically, which however they seldom
 * do. Most of the translators out there, including some of
 * Microsoft's own implementations, assume ISO8859-1 surface
 * encoding. For example the RTF documentation says that Word 1.0
 * does not implement pca / CP 850. With Microsoft Works, the
 * situation is even worse.
 *
 * This implementation treats RTF as a charset on its own. The
 * implementation presented here is based on Microsoft's own
 * published documentation of the RTF format.
 *
 * NOTE ON IMPLEMENTATION OF AN RTF READER
 * ---------------------------------------
 * Recode will not decode the RTF file and its groupings for you.
 * It will not deal with the actual document structure. It will only
 * decode the character escapes and similar. If you send recode a
 * "{" or a "}" it will be treated as nothing more than simple
 * characters. Backslashed keywords, say \foo, will just be copied
 * to output unless it signifies a character or escape sequence.
 *
 * Given a sequence like this:
 *
 * {\upr{\*\bkmstart LabGValue}{\*\ud{\*\bkmstart Lab\u915 Value}}}
 *
 * You will have to make sure for yourself that the unicode
 * sequence is sent to recode, not the first (ANSI) representation.
 * The same applies the other way around: when you want to write
 * unicode RTF, create
 * two Outer objects, one running ANSI and one running Unicode,
 * then fold the two resulting strings into the stream as in the
 * example above (which you will inevitably understand if you're
 * working with RTF writers).
 *
 * The workings of Recode regarding RTF handling are currently
 * limited to being a valuable library for the character recoding,
 * not as an RTF parser library. The same is true for, say, LaTeX
 * files - Recode doesn't build LaTeX-valid files, it converts from
 * and to the character escape sequences used in LaTeX.
 *
 * NOT FULLY IMPLEMENTED UNICODE SUPPORT
 * -------------------------------------
 * Unicode in RTF uses many weird tricks. Firstly it selects a
 * "default" codepage, then it only specifically escapes the chars
 * that do not fit in there.
So I have to work on a good solution * for this -- the best would be to gather statistical evidence of * which codepage should be prefered before selecting it (in RTF * output) and the reverse: make sure the right codepage is selected * based on specific \ansicpg keywords. That's tricky. * * DIACRITICS HANDLING * ------------------- * When invoked without any extra arguments, the rtf..ucs2 (or any * other charset) conversion will throw away all RTF-syntax markup. * This makes possible direct conversion from RTF to text file. * If you want to preserve the RTF-markup, use the -d (--diacritics) * option. * * On converting from ucs2..rtf the diacritics flag will avoid * escaping the control characters "\", "{" and "}". ********************************************************************/ struct codepost { unsigned code; /* code being translated */ const char *string; /* translation string */ }; static struct codepost translations [] = { {0x0009, "tab"}, /* tab */ {0x2014, "emdash"}, /* dash of m-width */ {0x2013, "endash"}, /* dash of n-width */ {0x2003, "emspace"}, /* space of m-width */ {0x2002, "enspace"}, /* space of n-width */ {0x2022, "bullet"}, /* bullet */ {0x2018, "lquote"}, /* single quote left */ {0x2019, "rquote"}, /* single quote right */ {0x201C, "ldblquote"}, /* double quote left */ {0x201D, "rdblquote"}, /* double quote right */ {0x200E, "ltrmark"}, /* left-to-right mark */ {0x200F, "rtlmark"}, /* right-to-left mark */ {0x200D, "zwj"}, /* zero width joiner */ {0x200C, "zwnj"}, /* zero width non-joiner */ {0, NULL} }; /* * A list of windows defined character set codes and their * Recode alias counterparts * * Not all charsets are supported by recode as of now. * when they are supported, make them appear here. The * cross-reference is derived from Windows header files * in Mingw32, information from the Wine emulator * documentation for character set cross reference * (fonts.c), the RTF 1.3 spec, and Mozilla documentation. 
* Some theoretically possible charsets still lack * documentation. * * OEM_CHARSET is dynamic and cannot be supported. */ /*--------------------------------------------------------. | Character set definitions. These are Windows internal | | character set #define-d variables. The table maps them | | onto the Recode equivalents. | `--------------------------------------------------------*/ struct charpost { unsigned windef_code; /* code being translated */ char bytes; /* One byte per char or multibyte? */ unsigned codepage; /* numerical codepage representation */ const char *recode_charset_name; /* translation string */ }; static struct charpost charsets [] = { {0, 1, 1252, "CP1252"}, /* ANSI_CHARSET (wingdi.h) */ {1, 2, 0, "UCS2"}, /* DEFAULT_CHARSET (Mozilla) */ {2, 1, 0, ""}, /* SYMBOL_CHARSET (wingdi.h) */ {77, 1, 0, "macintosh"}, /* MAC_CHARSET (wingdi.h) */ {128, 2, 932, "CP932"}, /* SHIFTJIS_CHARSET (Wine) */ {129, 2, 949, "CP949"}, /* HANGEUL_CHARSET (Wine) */ {130, 2, 1361, "CP1361"}, /* JOHAB_CHARSET (Wine) */ {134, 2, 936, "CP936"}, /* GB2312_CHARSET (Wine) */ {136, 2, 950, "CP950"}, /* CHINESEBIG5_CHARSET (Wine) */ {161, 1, 1253, "CP1253"}, /* GREEK_CHARSET (wingdi.h) */ {162, 1, 1254, "CP1254"}, /* TURKISH_CHARSET (wingdi.h) */ {163, 2, 1258, "CP1258"}, /* VIETNAMESE_CHARSET (Mozilla) */ {177, 1, 1255, "CP1255"}, /* HEBREW_CHARSET (wingdi.h) */ {178, 1, 1256, "CP1256"}, /* ARABIC_CHARSET former ARABICSIMPLIFIED_CHARSET (RTF 1.3) */ {179, 1, 0, ""}, /* ARABICTRADITIONAL_CHARSET - obsolete? (RTF 1.3) */ {180, 1, 0, ""}, /* ARABICUSER_CHARSET - obsolete? (RTF 1.3) */ {181, 1, 0, ""}, /* HEBREWUSER_CHARSET - obsolete? 
(RTF 1.3) */ {186, 1, 1257, "CP1257"}, /* BALTIC_CHARSET (wingdi.h) */ {204, 1, 1251, "CP1251"}, /* RUSSIAN_CHARSET former CYRILLIC_CHARSET (RTF 1.3) */ {222, 1, 874, "CP874"}, /* THAI_CHARSET (Wine) */ {238, 1, 1250, "CP1250"}, /* EASTEUROPE_CHARSET former EASTERNEUROPE_CHARSET (RTF 1.3) */ {254, 1, 437, "IBM437"}, /* PC437_CHARSET - obsolete? (RTF 1.3) */ {255, 1, 0, ""}, /* OEM_CHARSET (wingdi.h) */ {0, 0, 0, NULL} }; /* The encodings have 5 possible values */ enum rtf_encodings {RTF_ANSI, RTF_MAC, RTF_PC, RTF_PCA, RTF_UNICODE }; static int rtf_default_encoding; /* UCS-2 towards RTF. */ /*-----------------. | Initialisation. | `------------------*/ static bool init_ucs2_rtf_ansi (RECODE_STEP step, RECODE_CONST_REQUEST request, RECODE_CONST_OPTION_LIST before_options, RECODE_CONST_OPTION_LIST after_options) { rtf_default_encoding = RTF_ANSI; return true; } static bool init_ucs2_rtf_mac (RECODE_STEP step, RECODE_CONST_REQUEST request, RECODE_CONST_OPTION_LIST before_options, RECODE_CONST_OPTION_LIST after_options) { rtf_default_encoding = RTF_MAC; return true; } static bool init_ucs2_rtf_pc (RECODE_STEP step, RECODE_CONST_REQUEST request, RECODE_CONST_OPTION_LIST before_options, RECODE_CONST_OPTION_LIST after_options) { rtf_default_encoding = RTF_PC; return true; } static bool init_ucs2_rtf_pca (RECODE_STEP step, RECODE_CONST_REQUEST request, RECODE_CONST_OPTION_LIST before_options, RECODE_CONST_OPTION_LIST after_options) { rtf_default_encoding = RTF_PCA; return true; } static bool init_ucs2_rtf_unicode (RECODE_STEP step, RECODE_CONST_REQUEST request, RECODE_CONST_OPTION_LIST before_options, RECODE_CONST_OPTION_LIST after_options) { rtf_default_encoding = RTF_UNICODE; return true; } /*-----------------. | Transformation. 
| `-----------------*/ static bool transform_ucs2_rtf (RECODE_SUBTASK subtask) { RECODE_CONST_REQUEST request = subtask->task->request; RECODE_OUTER outer = subtask->task->request->outer; RECODE_REQUEST local_request = recode_new_request(outer); RECODE_TASK task = NULL; unsigned value; struct codepost *cursor1; char const *cursor2; switch (rtf_default_encoding) { case RTF_MAC: recode_scan_request(local_request, "ucs2..macintosh"); break; case RTF_PC: recode_scan_request(local_request, "ucs2..CP437"); break; case RTF_PCA: recode_scan_request(local_request, "ucs2..CP850"); break; default: recode_scan_request(local_request, "ucs2..CP1252"); break; } while (get_ucs2 (&value, subtask)) { bool found = false; /* * FIXME: * Special chars. Is the table big enough to be faster * if we use a hash lookup instead? Decide. */ for (cursor1 = translations; cursor1->code; cursor1++) { if (cursor1->code == value) { found = true; break; } } if (found) { put_byte('\\', subtask); for (cursor2 = cursor1->string; *cursor2; cursor2++) put_byte(*cursor2, subtask); put_byte(' ', subtask); } /* * Paragraph and line separators are accepted * from ucs2, but not written out the other way * (rtf..ucs2). The reason is that the other * output filters still don't know how to handle * these characters (0x2028, 0x2029). Rule of * thumb: be promiscuous in what you accept, be * puritan in what you write out. 
*/ else if (value == 0x2029) { put_byte('\\', subtask); put_byte('p', subtask); put_byte('a', subtask); put_byte('r', subtask); put_byte(' ', subtask); } else if (value == 0x2028) { put_byte('\\', subtask); put_byte('l', subtask); put_byte('i', subtask); put_byte('n', subtask); put_byte('e', subtask); put_byte(' ', subtask); } else if (value == 0x2011) { put_byte('\\', subtask); put_byte('_', subtask); } else if (value == 0xA0) { put_byte('\\', subtask); put_byte('~', subtask); } else if (value == 0xAD) { put_byte('\\', subtask); put_byte('-', subtask); } /* Escaped chars */ else if (value == '\\' || value == '{' || value == '}') { if (!request->diacritics_only) put_byte('\\', subtask); put_byte(value, subtask); } else { /* * FIXME: * Set enc_succeed variable to true only if the character * could be converted to the current charset. * * If encoding fails, value should keep its value, and * current_cp_approx should be set to the closest equivalent * in CP 1252. */ bool enc_succeed; char current_cp_approx; static char temp_string[3]; temp_string[0] = (unsigned char) value >> 8; temp_string[1] = (unsigned char) value & 255; temp_string[2] = '\0'; /* Recode using a Task */ task = recode_new_task(local_request); if (!task) { if (recode_if_nogo(RECODE_UNTRANSLATABLE, subtask)) { recode_delete_request(local_request); SUBTASK_RETURN (subtask); } else { value = (unsigned) '?'; } } else { task->input.buffer = temp_string; task->input.cursor = temp_string; task->input.limit = temp_string + 2; task->output.buffer = NULL; task->output.cursor = NULL; task->output.limit = NULL; task->strategy = RECODE_SEQUENCE_IN_MEMORY; task->fail_level = RECODE_AMBIGUOUS_OUTPUT; enc_succeed = recode_perform_task (task); if (enc_succeed) /* First byte in buffer as all possible charsets are one-byte */ value = (unsigned) task->output.buffer[0]; if (task->output.limit - task->output.buffer) free (task->output.buffer); recode_delete_task (task); } /* Ugly fix: closest approximation is always '?' 
* this is how Microsoft does it anyways... */ current_cp_approx = '?'; /* If the recoding didn't succeed, unicode will be used below */ if (enc_succeed) { /* Common characters pass thru */ if ((value > (unsigned) 31) && (value < (unsigned) 127)) { put_byte(value, subtask); } /* If the value is in the codepage, encode it... */ else if (value > (unsigned) 127) { /* Hex lookup table */ static char *cHex = "0123456789abcdef"; put_byte('\\', subtask); put_byte('\'', subtask); put_byte(cHex[(value >> 4) & 15], subtask); put_byte(cHex[value & 15], subtask); } else if (value < (unsigned) 32) { /* * Perhaps these should all be ignored? * they are valid but meaningless in RTF but perhaps not * so good to have there for confusing RTF readers. */ put_byte(value, subtask); } } else { /* * This is where unicode kicks in if everything * else fails. If even unicode fails, return an * encoding error to the subtask. * * As we cannot emit font switching commands * without full control of the font table, we * will try unicode, else we fail. */ if (rtf_default_encoding == RTF_UNICODE) { char buffer[32]; char *cursor; /* FIXME: add \ucN keyword output on double-bytes */ /* Unicode token output */ if (sprintf(buffer, "%d", value)) { put_byte('\\', subtask); put_byte('u', subtask); for (cursor = buffer; *cursor; cursor++) put_byte(*cursor, subtask); put_byte(current_cp_approx, subtask); } } else { /* Stop if the abort level has been reached */ if (recode_if_nogo(RECODE_UNTRANSLATABLE, subtask)) { recode_delete_request(local_request); SUBTASK_RETURN (subtask); } } } } } recode_delete_request(local_request); SUBTASK_RETURN (subtask); } /* RTF towards UCS-2. */ /* RTF Documentation says control words cannot be longer than * 32 characters, this adds a termination byte. */ #define KEYWORD_BUFFER_LENGTH 33 /*-----------------. | Initialisation. 
| `-----------------*/ static RECODE_SYMBOL rtf_default_charset; static RECODE_SYMBOL ansi_charset; /* windows ANSI_CHARSET */ static RECODE_SYMBOL mac_charset; /* windows MAC_CHARSET */ static RECODE_SYMBOL pc_charset; /* windows PC437_CHARSET (?) */ static RECODE_SYMBOL pca_charset; static bool init_common (RECODE_STEP step, RECODE_CONST_REQUEST request, RECODE_CONST_OPTION_LIST before_options, RECODE_CONST_OPTION_LIST after_options) { RECODE_OUTER outer = request->outer; RECODE_ALIAS alias; /* Initialize ALL charsets for recoding */ alias = find_alias (outer, "CP1252", ALIAS_FIND_AS_CHARSET); if (alias) ansi_charset = alias->symbol; else return false; alias = find_alias (outer, "macintosh", ALIAS_FIND_AS_CHARSET); if (alias) mac_charset = alias->symbol; else return false; alias = find_alias (outer, "IBM437", ALIAS_FIND_AS_CHARSET); if (alias) pc_charset = alias->symbol; else return false; alias = find_alias (outer, "IBM850", ALIAS_FIND_AS_CHARSET); if (alias) pca_charset = alias->symbol; else return false; } static bool init_rtf_ansi_ucs2 (RECODE_STEP step, RECODE_CONST_REQUEST request, RECODE_CONST_OPTION_LIST before_options, RECODE_CONST_OPTION_LIST after_options) { rtf_default_encoding = RTF_ANSI; if (!init_common (step, request, before_options, after_options)) return false; else rtf_default_charset = ansi_charset; return true; } static bool init_rtf_mac_ucs2 (RECODE_STEP step, RECODE_CONST_REQUEST request, RECODE_CONST_OPTION_LIST before_options, RECODE_CONST_OPTION_LIST after_options) { rtf_default_encoding = RTF_MAC; if (!init_common (step, request, before_options, after_options)) return false; else rtf_default_charset = mac_charset; return true; } static bool init_rtf_pc_ucs2 (RECODE_STEP step, RECODE_CONST_REQUEST request, RECODE_CONST_OPTION_LIST before_options, RECODE_CONST_OPTION_LIST after_options) { rtf_default_encoding = RTF_PC; if (!init_common (step, request, before_options, after_options)) return false; else rtf_default_charset = pc_charset; 
return true; } static bool init_rtf_pca_ucs2 (RECODE_STEP step, RECODE_CONST_REQUEST request, RECODE_CONST_OPTION_LIST before_options, RECODE_CONST_OPTION_LIST after_options) { rtf_default_encoding = RTF_PCA; if (!init_common (step, request, before_options, after_options)) return false; else rtf_default_charset = pca_charset; return true; } static bool init_rtf_unicode_ucs2 (RECODE_STEP step, RECODE_CONST_REQUEST request, RECODE_CONST_OPTION_LIST before_options, RECODE_CONST_OPTION_LIST after_options) { rtf_default_encoding = RTF_UNICODE; if (!init_common (step, request, before_options, after_options)) return false; else rtf_default_charset = ansi_charset; return true; } /*----------------------------------------. | A simple LIFO stack keeps track of the | | state in the groups of the RTF file... | `----------------------------------------*/ typedef struct tStackPost tStackPost; struct tStackPost { int stacked_rtf_encoding; RECODE_SYMBOL stacked_rtf_charset; int stacked_ucodelevel; bool stacked_mode_dbcs; bool stacked_mode_dbcs_first; bool stacked_fonttbl_read_state; bool stacked_discard_group; bool stacked_read_field; tStackPost *next; }; static tStackPost* stack_push (tStackPost *listp, int rtf_encoding, RECODE_SYMBOL rtf_charset, int ucodelevel, bool mode_dbcs, bool mode_dbcs_first, bool fonttbl_read_state, bool discard_group, bool read_field) { tStackPost *newp; newp = (tStackPost *) malloc(sizeof(tStackPost)); newp->stacked_rtf_encoding = rtf_encoding; newp->stacked_rtf_charset = rtf_charset; newp->stacked_ucodelevel = ucodelevel; newp->stacked_mode_dbcs = mode_dbcs; newp->stacked_mode_dbcs_first = mode_dbcs_first; newp->stacked_fonttbl_read_state = fonttbl_read_state; newp->stacked_discard_group = discard_group; newp->stacked_read_field = read_field; newp->next = NULL; if(listp == NULL) { listp = newp; return listp; } newp->next = listp; listp = newp; return listp; } static tStackPost* stack_pull (tStackPost *listp, int *rtf_encoding, RECODE_SYMBOL 
*rtf_charset, int *ucodelevel, bool *mode_dbcs, bool *mode_dbcs_first, bool *fonttbl_read_state, bool *discard_group, bool *read_field) { tStackPost *p; if (listp == NULL) return listp; *rtf_encoding = listp->stacked_rtf_encoding; *rtf_charset = listp->stacked_rtf_charset; *ucodelevel = listp->stacked_ucodelevel; *mode_dbcs = listp->stacked_mode_dbcs; *mode_dbcs_first = listp->stacked_mode_dbcs_first; *fonttbl_read_state = listp->stacked_fonttbl_read_state; *discard_group = listp->stacked_discard_group; *read_field = listp->stacked_read_field; p = listp; listp = listp->next; free(p); return listp; } static void stack_free (tStackPost *listp) { tStackPost *next; for ( ; listp != NULL; listp = next) { next = listp->next; free (listp); } } /*----------------------------------------. | Font table with its associated charsets | `-----------------------------------------*/ /* * FIXME: theoretically this should be a hash table * but the typical size of a font table sort of makes * it overkill. Decide. */ typedef struct tFontPost tFontPost; struct tFontPost { unsigned font_id; RECODE_SYMBOL charset; char bytes; tFontPost *next; }; static tFontPost* add_font (tFontPost *listp, unsigned font_id, RECODE_SYMBOL default_charset) { tFontPost *newp; newp = (tFontPost *) malloc(sizeof(tFontPost)); newp->font_id = font_id; newp->charset = default_charset; newp->bytes = 0; newp->next = NULL; if(listp == NULL) { listp = newp; return listp; } newp->next = listp; listp = newp; return listp; } static tFontPost* associate_charset_to_font (RECODE_OUTER outer, tFontPost *listp, unsigned f) { RECODE_SYMBOL charset; RECODE_ALIAS alias; struct charpost *cursor; /* Select charset for this font */ charset = NULL; for (cursor = charsets; cursor->bytes; cursor++) { if (cursor->windef_code == f) { if (*cursor->recode_charset_name) { alias = find_alias (outer, cursor->recode_charset_name, ALIAS_FIND_AS_CHARSET); if (alias) charset = alias->symbol; else /* FIXME: Error or silently ignored? 
*/ { printf("Fonttable: could not find alias for %s!\n", cursor->recode_charset_name); } break; } else { /* FIXME: Error for "fonttable contains non-translatable charset" ? printf("Fonttable contains untranslatable windows charset %d!\n", cursor->windef_code); printf("(But it may not be used in the file...)\n"); */ break; } } } if(listp == NULL) { /* FIXME: return error, panic, whatever */ return listp; } listp->charset = charset; listp->bytes = cursor->bytes; return listp; } static void get_charset_from_font (tFontPost *listp, unsigned f, RECODE_SYMBOL *rtf_charset, bool *mode_dbcs) { tFontPost *p; p = listp; while (p != NULL) { if (p->font_id == f) { *rtf_charset = p->charset; *mode_dbcs = (p->bytes == 2); break; } p = p->next; } } static void font_free (tFontPost *listp) { tFontPost *next; for ( ; listp != NULL; listp = next) { next = listp->next; free (listp); } } /*-------------------------------. | Accessing a codepage directly | `--------------------------------*/ static void get_charset_from_codepage (RECODE_OUTER outer, unsigned cp, RECODE_SYMBOL *rtf_charset, bool *mode_dbcs) { RECODE_SYMBOL charset; RECODE_ALIAS alias; bool dbcs; struct charpost *cursor; /* Select charset for this codepage */ charset = rtf_default_charset; for (cursor = charsets; cursor->bytes; cursor++) { if (cursor->codepage == cp) { if (*cursor->recode_charset_name) { alias = find_alias (outer, cursor->recode_charset_name, ALIAS_FIND_AS_CHARSET); if (alias) { charset = alias->symbol; dbcs = (cursor->bytes == 2); } else /* FIXME: Error or silently ignored? */ { printf("Could not find alias for %s!\n", cursor->recode_charset_name); } break; } else { printf("No translation available for codepage %d!\n", cp); break; } } } *rtf_charset = charset; *mode_dbcs = dbcs; } /*-----------------------------------------. | Translation routine. Once code_to_ucs2() | | works for any character, this will be a | | lot simpler. 
| `-----------------------------------------*/

/* Replace *VALUE, a character code in the charset RTF_CHARSET, by its
   UCS-2 code.  Return true on success; on failure return false.

   - RTF_CHARSET is NULL: the charset is untranslatable, so
     *MODE_DISCARD_GROUP is set to stop further attempts for this group
     and *VALUE is left alone.
   - RTF_CHARSET has a `data' member: code_to_ucs2 does the work.
   - Otherwise LOCAL_REQUEST is (re)scanned for "<name>..ucs2" when
     needed, and the code is recoded through an in-memory task, fed as
     two bytes, high byte first.  *VALUE becomes 0xFFFF when recoding
     fails; it is left alone when the request cannot be scanned.

   NOTE(review): two bytes are always sent and the first two output
   bytes are taken as the answer; confirm that this matches what recode
   produces for single-byte source charsets and that no byte order mark
   precedes the data.  */

static bool
translate (RECODE_SYMBOL rtf_charset,
           RECODE_REQUEST local_request,
           unsigned *value,
           bool *mode_discard_group)
{
  static const char request_suffix[] = "..ucs2";

  /* What was scanned last, and into which request, so that the scan is
     not repeated for every character.  */
  static char local_request_string[127] = "";
  static RECODE_REQUEST scanned_request = NULL;

  static char temp_request_string[127];
  char temp_string[3];
  RECODE_TASK task = NULL;

  if (!rtf_charset)
    {
      /* This is untranslatable, signal and set discard for this group,
       * in order to avoid further tries to translate from the invalid
       * charset. */
      *mode_discard_group = true;
      return false;
    }

  /* The simple case: a table is available.  */
  if (rtf_charset->data)
    {
      *value = code_to_ucs2 (rtf_charset, *value);
      return (*value != 0xFFFF);
    }

  /* If the charset is not easily translatable, we fall back on a
     request...  Refuse names that would overflow the buffer;
     sizeof request_suffix counts the terminating NUL.  */
  if (strlen (rtf_charset->name) + sizeof request_suffix
      > sizeof temp_request_string)
    return false;
  sprintf (temp_request_string, "%s%s", rtf_charset->name, request_suffix);

  /* Scan when the charset changed, and also when a different request
     object is passed: a string cached for another request says nothing
     about this one.  */
  if (scanned_request != local_request
      || strcmp (temp_request_string, local_request_string) != 0)
    {
      strcpy (local_request_string, temp_request_string);
      if (!recode_scan_request (local_request, local_request_string))
        {
          /* Forget the cache, else the next call would skip the scan
             and use a request that was never set up.  */
          local_request_string[0] = '\0';
          scanned_request = NULL;
          return false;
        }
      scanned_request = local_request;
    }

  /* Shift first, narrow afterwards: a cast binds tighter than >>, which
     is why `(unsigned char) *value >> 8' does not work.  */
  temp_string[0] = (char) ((*value >> 8) & 0xFF);
  temp_string[1] = (char) (*value & 0xFF);
  temp_string[2] = '\0';

  /* Recode using a Task */
  task = recode_new_task (local_request);
  if (!task)
    {
      /* Reported through the return value below.  */
      *value = 0xFFFF;
    }
  else
    {
      task->input.buffer = temp_string;
      task->input.cursor = temp_string;
      task->input.limit = temp_string + 2;
      task->output.buffer = NULL;
      task->output.cursor = NULL;
      task->output.limit = NULL;
      task->strategy = RECODE_SEQUENCE_IN_MEMORY;
      task->fail_level = RECODE_AMBIGUOUS_OUTPUT;

      /* the function does not always behave as it should, eg CP950.
         NOTE(review): the length test relies on output.cursor marking
         the end of the written data, as the recode buffer API does --
         confirm.  */
      if (recode_perform_task (task)
          && task->output.buffer
          && task->output.cursor - task->output.buffer >= 2)
        {
          /* ucs-2 are always two bytes.  Read them as unsigned: with a
             signed `char' a low byte of 0x80 or more used to
             sign-extend and overwrite the high byte with ones.  */
          const unsigned char *out
            = (const unsigned char *) task->output.buffer;

          *value = ((unsigned) out[0] << 8) | (unsigned) out[1];
        }
      else
        *value = 0xFFFF;

      free (task->output.buffer);
      recode_delete_task (task);
    }

  return (*value != 0xFFFF);
}

/*---------------------------------.
| Transformation from RTF to UCS2  |
`----------------------------------*/

static bool
transform_rtf_ucs2 (RECODE_SUBTASK subtask)
{
  RECODE_CONST_REQUEST request = subtask->task->request;
  RECODE_OUTER outer = subtask->task->request->outer;
  RECODE_REQUEST local_request = recode_new_request(outer);
  RECODE_SYMBOL rtf_charset;
  int input_char;
  int rtf_encoding;
  char *cCurrentCharset;

  /* Modes */
  bool mode_read_fonttbl = false;
  bool mode_add_trail_space = false; /* to add trailing spaces after alpha only keywords */
  bool mode_discard_group = false;
  bool mode_read_field = false;
  bool mode_dbcs = false; /* handle double-byte characters */
  bool mode_dbcs_first = false;

  /* Number of bytes used in char representation and related unicode problems */
  unsigned dbcs_character;
  bool ucode_consume_next = false;
  int ucodelevel = 1;
  int cnt_ucodelevel = 1;

  /* Our state stack */
  tStackPost *stacklistp = NULL;

  /* Out font list */
  tFontPost *fontlistp = NULL;

  char field_buffer[256]; /* Holder for Fields */
  char *field_cursor;

  struct codepost *cursor3;
  bool found;

  /* Send the remainder this group to null */

  rtf_encoding = rtf_default_encoding;
  rtf_charset = rtf_default_charset;

  input_char = get_byte (subtask);

  while (input_char != EOF)
    {
      /*------------------------.
| Groups and state stack | `------------------------*/ if (input_char == '{') { /* Push current state on stack */ stacklistp = stack_push(stacklistp, rtf_encoding, rtf_charset, ucodelevel, mode_dbcs, mode_dbcs_first, mode_read_fonttbl, mode_discard_group, mode_read_field); rtf_charset = rtf_default_charset; if (request->diacritics_only) { mode_add_trail_space = false; put_ucs2('{', subtask); } input_char = get_byte(subtask); } else if (input_char == '}') { /* Pull state from stack */ stacklistp = stack_pull(stacklistp, &rtf_encoding, &rtf_charset, &ucodelevel, &mode_dbcs, &mode_dbcs_first, &mode_read_fonttbl, &mode_discard_group, &mode_read_field); if (request->diacritics_only) { mode_add_trail_space = false; put_ucs2('}', subtask); } input_char = get_byte(subtask); } /*------------------------------------------. | Detection of command words + handling of | | short (1-2 character) commands. | `------------------------------------------*/ else if (input_char == '\\') { input_char = get_byte (subtask); /* Escaped characters * Some RTF writers treat \: as just a colon (as in \{ eg) * but it is actually a keyword. This implementation treats * it as a keyword. 
*/ if (input_char == '\\' || input_char == '{' || input_char == '}') { if (request->diacritics_only) put_ucs2 ('\\', subtask); put_ucs2 (input_char, subtask); input_char = get_byte (subtask); } else if (input_char == '*') { if (request->diacritics_only) { put_ucs2 ('\\', subtask); put_ucs2 ('*', subtask); } else mode_discard_group = true; mode_add_trail_space = false; input_char = get_byte (subtask); } else if (input_char == ':' || input_char == '|' || input_char == 0x0A || input_char == 0x0D) { mode_add_trail_space = false; if (request->diacritics_only) { put_ucs2 ('\\', subtask); put_ucs2 (input_char, subtask); } input_char = get_byte (subtask); } else if (input_char == '~') { put_ucs2 (0xA0, subtask); input_char = get_byte (subtask); } else if (input_char == '-') { put_ucs2 (0xAD, subtask); input_char = get_byte (subtask); } else if (input_char == '_') { put_ucs2 (0x2011, subtask); input_char = get_byte (subtask); } /* Scan for two-digit hex notation */ else if (input_char == '\'') { unsigned value = 0; int chars = 0; /* Scan \'[0-9a-fA-F]{2} notation. */ while (chars != 2) { input_char = get_byte (subtask); if (input_char >= '0' && input_char <= '9') value = 16 * value + input_char - '0'; else if (input_char >= 'A' && input_char <= 'F') value = 16 * value + input_char - 'A' + 10; else if (input_char >= 'a' && input_char <= 'f') value = 16 * value + input_char - 'a' + 10; else break; chars++; } /* Convert value from desired charset to ucs2 * abort if untranslatable & user wants this. 
*/ if (!ucode_consume_next && !mode_discard_group) { if (mode_dbcs && !mode_dbcs_first && value >= 0x80) { /* First byte of double byte character */ dbcs_character = value << 8; mode_dbcs_first = true; } else { if (mode_dbcs && mode_dbcs_first) { /* Second byte of double byte character */ value = dbcs_character + value; mode_dbcs_first = false; } /* Translate the character */ if (!translate (rtf_charset, local_request, &value, &mode_discard_group)) /* This takes care of any recoding errors */ { if (recode_if_nogo(RECODE_UNTRANSLATABLE, subtask)) { stack_free(stacklistp); font_free(fontlistp); recode_delete_request(local_request); SUBTASK_RETURN (subtask); } } else { if (mode_add_trail_space && value >= '@') put_ucs2 (' ', subtask); mode_add_trail_space = false; put_ucs2 (value, subtask); } } } else if (ucode_consume_next) { cnt_ucodelevel--; if (!cnt_ucodelevel) ucode_consume_next = false; } input_char = get_byte (subtask); } /*-------------------------------------. | Handle long command words and their | | parameters. | `-------------------------------------*/ else { char buffer1[KEYWORD_BUFFER_LENGTH]; /* Holder for Keyword */ char buffer2[KEYWORD_BUFFER_LENGTH]; /* Holder for Keyword parameter */ char *cursor1 = buffer1; char *cursor2 = buffer2; *cursor1 = '\0'; *cursor2 = '\0'; /* Read the alpha keyword */ while (input_char >= 'a' && input_char <= 'z') { *cursor1++ = input_char; *cursor1 = '\0'; input_char = get_byte (subtask); } /* * Read in the parameter, if any. * only the very first character may be a '-' */ if ((input_char == '-' && strlen(buffer2) == 0) || (input_char >= '0' && input_char <= '9')) { while (input_char == '-' || (input_char >= '0' && input_char <= '9')) { *cursor2++ = input_char; *cursor2 = '\0'; input_char = get_byte (subtask); } } /* If a space follows the keyword or parameter, consume it. 
*/ if (input_char == ' ') input_char = get_byte (subtask); /* Unicode handling */ if (!strcmp(buffer1, "uc")) { /* Number of bytes used by the approximation following * a unicode character, mostly on the form \'hh\'hh... */ ucodelevel = (unsigned) atoi(buffer2); } else if (!strcmp(buffer1, "u")) { if (rtf_encoding == RTF_UNICODE) { unsigned value = 0; value = (unsigned) atoi(buffer2); if (value > 0 && !mode_discard_group) put_ucs2(value, subtask); /* Consume the next char which is an ANSI approximation * ucodelevel contains the number of bytes to consume, * taken from the \uc keyword */ cnt_ucodelevel = ucodelevel; ucode_consume_next = true; } else { /* Skip the unicode characters, abort if desired */ if (recode_if_nogo(RECODE_INVALID_INPUT, subtask)) { stack_free(stacklistp); font_free(fontlistp); recode_delete_request(local_request); SUBTASK_RETURN (subtask); } } } /*----------------------------------------. | Scan the table of special chars to see | | if the command word appears there. | `----------------------------------------*/ /* * FIXME: * Is the table big enough to be faster * if we use a hash lookup instead? */ else { found = false; for (cursor3 = translations; cursor3->code; cursor3++) { if (!strcmp(buffer1,cursor3->string)) { if (mode_add_trail_space) { put_ucs2(' ', subtask); mode_add_trail_space = false; } put_ucs2(cursor3->code, subtask); found = true; break; } } /* * This is where we end up with no apparent translations are * provided for the keyword... * * Recode will handle some special keywords that are needed * to map characters onto non-standard charsets. This is * achieved by building the font table in memory, IF it is * present in the stream. */ if (!found) { /*---------------------------------------. 
| Font and codepage switching functions | `---------------------------------------*/ /* First, see if it is even time to switch default font */ if (!strcmp(buffer1, "ansi")) { /* Make CP1252 encoding the default */ rtf_default_encoding = RTF_UNICODE; rtf_encoding = rtf_default_encoding; rtf_default_charset = ansi_charset; rtf_charset = rtf_default_charset; } else if (!strcmp(buffer1, "mac")) { /* Make macintosh endoding the default */ rtf_default_encoding = RTF_MAC; rtf_encoding = rtf_default_encoding; rtf_default_charset = mac_charset; rtf_charset = rtf_default_charset; } else if (!strcmp(buffer1, "pc")) { /* Make IBM CP 437 the default */ rtf_default_encoding = RTF_PC; rtf_encoding = rtf_default_encoding; rtf_default_charset = pc_charset; rtf_charset = rtf_default_charset; } else if (!strcmp(buffer1, "pca")) { /* Make IBM CP 850 the default */ rtf_default_encoding = RTF_PCA; rtf_encoding = rtf_default_encoding; rtf_default_charset = pca_charset; rtf_charset = rtf_default_charset; } else if (!strcmp(buffer1, "fonttbl")) { /************************************ * Go into font table reading state * the only way of leaving this mode * is by the old state being pulled * from the stack. All font data will * be consumed! * FIXME: Perhaps a more exakt * font table parsing algorithm is * needed for parsing Windings, Symbols * and the like more exactly? 
************************************/ if (!request->diacritics_only) mode_discard_group = true; mode_read_fonttbl = true; } else if (!strcmp(buffer1, "f")) { /* Read a font, or switch to a certain font */ unsigned font = 0; font = (unsigned) atoi(buffer2); if (mode_read_fonttbl) /* Add font to table, using current charset as default */ fontlistp = add_font (fontlistp, font, rtf_charset); else { get_charset_from_font (fontlistp, font, &rtf_charset, &mode_dbcs); mode_dbcs_first = false; } /* Clear buffer to remove command from output */ if (!request->diacritics_only) { *buffer1 = '\0'; *buffer2 = '\0'; } } else if (!strcmp(buffer1, "ftech") && mode_read_fonttbl) { /* Sometimes the Symbol font is referenced this way */ mode_discard_group = true; *field_buffer = '\0'; field_cursor = field_buffer; while (input_char == ' ') input_char = get_byte(subtask); while (input_char != EOF && input_char != '}') { if (input_char >= 0x40) /* Do not accumulate trailing ; */ { *field_cursor = input_char; field_cursor++; *field_cursor = '\0'; } input_char = get_byte(subtask); } if (!strcmp(field_buffer, "Symbol")) { /* Associate it with the symbol font */ fontlistp = associate_charset_to_font (outer, fontlistp, 2); } } else if (!strcmp(buffer1, "fcharset") && mode_read_fonttbl) { /* Associate a charset with the current * position in the font table */ unsigned charset = 0; charset = (unsigned) atoi(buffer2); fontlistp = associate_charset_to_font (outer, fontlistp, charset); } else if (!strcmp(buffer1, "cpg") || !strcmp(buffer1, "ansicpg")) { /* Just switch to this codepage, if applicable */ get_charset_from_codepage (outer, (unsigned) atoi(buffer2), &rtf_charset, &mode_dbcs); } /*------------. | Paragraphs | `------------*/ else if (!strcmp(buffer1, "par")) { /* How does this match up with surfaces? */ put_ucs2(0x0D, subtask); put_ucs2(0x0A, subtask); } /*-------------------------------------------. 
| Functionality to process insertion fields | `------------------------------------------*/ else if (!strcmp(buffer1, "field")) { mode_read_field = true; mode_discard_group = true; *field_buffer = '\0'; field_cursor = field_buffer; } else if (!strcmp(buffer1, "fldinst") && mode_read_field) { while (input_char == ' ') input_char = get_byte(subtask); while (input_char != EOF && input_char != '}') { if (input_char >= 0x20) { *field_cursor = input_char; field_cursor++; *field_cursor = '\0'; } input_char = get_byte(subtask); } } else if (!strcmp(buffer1, "fldrslt") && mode_read_field) { field_cursor = strtok(field_buffer, " "); if (!strcmp(field_cursor, "SYMBOL")) { /* FIXME: * SYMBOL FONTS. None of these fonts are * in recode as of yet, but should be handled in the future. * Note that Microsoft have not submitted mappings for the * symbol font(s) to the Unicode consortium. */ /* Get the symbol from the appropriate symbol font */ unsigned symbol = 0; field_cursor = strtok(NULL, " "); symbol = (unsigned) atoi(field_cursor); printf("Found symbol: %d\n", symbol); field_cursor = strtok(NULL, " "); field_cursor = strtok(NULL, " "); if (*field_cursor == '\"') { field_cursor++; field_cursor[strlen(field_cursor)-1] = '\0'; } printf("Font: %s is not yet in Recode.\n", field_cursor); /* Add conversion here when symbol font is in recode */ } } /*------------------------------------. 
| Commands that are silently ignored | `------------------------------------*/ /* These are commands (destinations) to skip if consuming RTF */ else if ((!strcmp(buffer1, "colortbl") || !strcmp(buffer1, "stylesheet") || !strcmp(buffer1, "pict") || !strcmp(buffer1, "shp") || !strcmp(buffer1, "footnote") || !strcmp(buffer1, "header") || !strcmp(buffer1, "headerl") || !strcmp(buffer1, "headerr") || !strcmp(buffer1, "headerf") || !strcmp(buffer1, "footer") || !strcmp(buffer1, "footerl") || !strcmp(buffer1, "footerr") || !strcmp(buffer1, "footerf") || !strcmp(buffer1, "ftnsep") || !strcmp(buffer1, "ftnsepc") || !strcmp(buffer1, "ftncn") || !strcmp(buffer1, "comment") || /* info group is not discarded */ !strcmp(buffer1, "title") || !strcmp(buffer1, "subject") || !strcmp(buffer1, "author") || !strcmp(buffer1, "operator") || !strcmp(buffer1, "keywords") || !strcmp(buffer1, "doccomm") || !strcmp(buffer1, "version") || !strcmp(buffer1, "nextfile")) && !request->diacritics_only) { mode_discard_group = true; } else if (!strcmp(buffer1, "upr") && (rtf_encoding == RTF_UNICODE) && !request->diacritics_only) { mode_discard_group = true; } /* Reactivate group for Unicode destinations */ else if (!strcmp(buffer1, "ud") && (rtf_encoding == RTF_UNICODE)) { mode_discard_group = false; } /*----------------------------------------------------. | For all commands: | | just duplicate the keyword and parameter on output | | in case we're just running diacritics only. | `----------------------------------------------------*/ if (request->diacritics_only && !mode_discard_group) { mode_add_trail_space = false; put_ucs2 ('\\', subtask); for (cursor1 = buffer1; *cursor1; cursor1++) put_ucs2 (*cursor1, subtask); if (!*buffer2) mode_add_trail_space = true; else { for (cursor2 = buffer2; *cursor2; cursor2++) put_ucs2 (*cursor2, subtask); } } } } } } else /*-----------------------------------------. | If this was not a command word, process | | the plaintext character. 
| `-----------------------------------------*/ { if (!ucode_consume_next) { if (!mode_discard_group) { if (mode_add_trail_space && input_char != '\\' && input_char != '{' && input_char != '}' && input_char != ';') put_ucs2 (' ', subtask); mode_add_trail_space = false; /* CP1251 < 0x20 is silently ignored, unless we're running * diacritics only */ if (input_char >= 0x20 || request->diacritics_only) { unsigned value = (unsigned) input_char; /* Non-DBCS characters in the lower region are in rtf_default_charset */ if (!translate (mode_dbcs ? rtf_default_charset : rtf_charset, local_request, &value, &mode_discard_group)) { if (recode_if_nogo(RECODE_UNTRANSLATABLE, subtask)) { stack_free(stacklistp); font_free(fontlistp); recode_delete_request(local_request); SUBTASK_RETURN (subtask); } } else { put_ucs2 (value, subtask); } } } } else { if (input_char >= 0x20) cnt_ucodelevel--; if (!cnt_ucodelevel) ucode_consume_next = false; } input_char = get_byte (subtask); } } /* Free state stack and font table before exiting */ stack_free(stacklistp); font_free(fontlistp); recode_delete_request(local_request); SUBTASK_RETURN (subtask); } /*--------------------. 
| Module declaration |
`--------------------*/

/* Register the RTF steps and aliases with OUTER.

   For each of the five RTF flavours (ANSI, MAC, PC, PCA, UNICODE) two single
   steps are declared: one from ISO-10646-UCS-2 to the flavour and one back.
   The short alias names are declared afterwards.  Declarations are attempted
   strictly in the order written below, and the first one that fails (yields
   a zero result) stops the registration.

   Returns true only when every declaration succeeded, false otherwise.  */
bool
module_rtf (RECODE_OUTER outer)
{
  /* RTF_1.0-ANSI */
  if (!declare_single (outer, "ISO-10646-UCS-2", "RTF_1.0-ANSI",
                       outer->quality_byte_to_variable,
                       init_ucs2_rtf_ansi, transform_ucs2_rtf))
    return false;
  if (!declare_single (outer, "RTF_1.0-ANSI", "ISO-10646-UCS-2",
                       outer->quality_variable_to_byte,
                       init_rtf_ansi_ucs2, transform_rtf_ucs2))
    return false;

  /* RTF_1.0-MAC */
  if (!declare_single (outer, "ISO-10646-UCS-2", "RTF_1.0-MAC",
                       outer->quality_byte_to_variable,
                       init_ucs2_rtf_mac, transform_ucs2_rtf))
    return false;
  if (!declare_single (outer, "RTF_1.0-MAC", "ISO-10646-UCS-2",
                       outer->quality_variable_to_byte,
                       init_rtf_mac_ucs2, transform_rtf_ucs2))
    return false;

  /* RTF_1.0-PC */
  if (!declare_single (outer, "ISO-10646-UCS-2", "RTF_1.0-PC",
                       outer->quality_byte_to_variable,
                       init_ucs2_rtf_pc, transform_ucs2_rtf))
    return false;
  if (!declare_single (outer, "RTF_1.0-PC", "ISO-10646-UCS-2",
                       outer->quality_variable_to_byte,
                       init_rtf_pc_ucs2, transform_rtf_ucs2))
    return false;

  /* RTF_1.0-PCA */
  if (!declare_single (outer, "ISO-10646-UCS-2", "RTF_1.0-PCA",
                       outer->quality_byte_to_variable,
                       init_ucs2_rtf_pca, transform_ucs2_rtf))
    return false;
  if (!declare_single (outer, "RTF_1.0-PCA", "ISO-10646-UCS-2",
                       outer->quality_variable_to_byte,
                       init_rtf_pca_ucs2, transform_rtf_ucs2))
    return false;

  /* RTF_1.0-UNICODE */
  if (!declare_single (outer, "ISO-10646-UCS-2", "RTF_1.0-UNICODE",
                       outer->quality_byte_to_variable,
                       init_ucs2_rtf_unicode, transform_ucs2_rtf))
    return false;
  if (!declare_single (outer, "RTF_1.0-UNICODE", "ISO-10646-UCS-2",
                       outer->quality_variable_to_byte,
                       init_rtf_unicode_ucs2, transform_rtf_ucs2))
    return false;

  /* The plain "rtf" name defaults to the most powerful encoding.  */
  if (!declare_alias (outer, "rtf", "RTF_1.0-UNICODE"))
    return false;

  /* Short names for the individual flavours.  */
  if (!declare_alias (outer, "rtfansi", "RTF_1.0-ANSI"))
    return false;
  if (!declare_alias (outer, "rtfmac", "RTF_1.0-MAC"))
    return false;
  if (!declare_alias (outer, "rtfpc", "RTF_1.0-PC"))
    return false;
  if (!declare_alias (outer, "rtfcp437", "RTF_1.0-PC"))
    return false;
  if (!declare_alias (outer, "rtfpca", "RTF_1.0-PCA"))
    return false;
  if (!declare_alias (outer, "rtfcp850", "RTF_1.0-PCA"))
    return false;
  if (!declare_alias (outer, "rtfu", "RTF_1.0-UNICODE"))
    return false;

  return true;
}

/* Counterpart of module_rtf.  The body is intentionally empty: this
   function performs no work and OUTER is not used.  */
void
delmodule_rtf (RECODE_OUTER outer)
{
  (void) outer;
}