/*
 * file: utf8.l
 *
 * Translates UTF-8 encoded text (from a file or from stdin) to
 * iso-8859-1 / iso-8859-15 on stdout.
 *
 * required flex option:
 *
 *     -8   (generate 8bit scanner)
 */

%{

/*
 * NOTE(review): the header names were missing from the copy of this file
 * that was reviewed; the list below is reconstructed from the functions
 * the code uses (fnsplit/MAXFILE, isatty, errno, va_list, exit, strdup,
 * ...). Confirm against the original, in particular the MSDOS branch.
 */
#ifdef __MSDOS__
#  include <dir.h>
#  include <fcntl.h>
#  include <io.h>
#else
#  include <unistd.h>
#endif
#include <errno.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Non-zero (option -c): print "U+code" for characters that cannot be mapped */
int
    printcode = 0;

/* Base name of the executable, used in error and usage messages */
char
    *programname;

void
    get_programname (char const *argv0),
    syntax (void),
    errit (char const *format, ...),
    bytes2 (void),
    bytes3 (void),
    bytes4 (void),
    bytes5 (void),
    bytes6 (void),
    outchar (long unsigned);

#define YY_NO_UNPUT
#define YY_SKIP_YYWRAP

#ifdef yywrap
#  undef yywrap
#endif

/* Only one input is ever scanned: tell the scanner to stop at end of file */
int yywrap (void)
{
    return 1;
}

%}

/*
 * A UTF-8 continuation byte. Requiring this class (instead of `.', which
 * matches any byte but newline) keeps a stray lead byte from swallowing
 * the characters that follow it. Bytes that match no rule are copied to
 * the output unchanged by the scanner's default rule.
 */
CONT    [\200-\277]

%%

[\300-\337]{CONT}                               { bytes2 (); }
[\340-\357]{CONT}{CONT}                         { bytes3 (); }
[\360-\367]{CONT}{CONT}{CONT}                   { bytes4 (); }
[\370-\373]{CONT}{CONT}{CONT}{CONT}             { bytes5 (); }
[\374-\375]{CONT}{CONT}{CONT}{CONT}{CONT}       { bytes6 (); }

%%

/*
 * Decode the first `length' bytes of yytext as one UTF-8 sequence.
 *
 * The lead byte contributes its low (7 - length) bits, every following
 * byte its low 6 bits, most significant part first. `length' must be in
 * the range 2..6 and yytext must hold at least that many bytes.
 */
static long unsigned decode_sequence (int length)
{
    long unsigned
        code;
    int
        i;

    /* 0x7F >> length gives 0x1F, 0x0F, 0x07, 0x03, 0x01 for 2..6 bytes */
    code = (unsigned char) yytext [0] & (0x7FU >> length);
    for (i = 1; i < length; i++)
        code = (code << 6) | ((unsigned char) yytext [i] & 0x3F);

    return code;
}

/* Decode the 2-byte sequence in yytext and write it with outchar() */
void bytes2 (void)
{
    outchar (decode_sequence (2));
}

/* Decode the 3-byte sequence in yytext and write it with outchar() */
void bytes3 (void)
{
    outchar (decode_sequence (3));
}

/* Decode the 4-byte sequence in yytext and write it with outchar() */
void bytes4 (void)
{
    outchar (decode_sequence (4));
}

/* Decode the 5-byte sequence in yytext and write it with outchar() */
void bytes5 (void)
{
    outchar (decode_sequence (5));
}

/* Decode the 6-byte sequence in yytext and write it with outchar() */
void bytes6 (void)
{
    outchar (decode_sequence (6));
}

/*
 * Write the character with code `c' to stdout.
 *
 * Tried in this order:
 *   - codes below 256 are written as a single byte;
 *   - the characters listed under iso-8859-15 below are written as the
 *     byte assigned to them there;
 *   - the IJ/ij ligatures are written as two letters;
 *   - anything else is written as "U+code" if printcode is set,
 *     otherwise as byte 191.
 */
void outchar (long unsigned c)
{
    int
        i;
    char
        *s;

    /*
     * iso-8859-1
     */

    if (c < 256) {
        fputc (c, stdout);
        return;
    }

    /*
     * iso-8859-15
     */

    i = 0;
    switch (c) {
        case 0x20Ac: i = 0xA4; break;  /* euro */
        case 0x0160: i = 0xA6; break;  /* S caron */
        case 0x0161: i = 0xA8; break;  /* s caron */
        case 0x017D: i = 0xB4; break;  /* Z caron */
        case 0x017E: i = 0xB8; break;  /* z caron */
        case 0x0152: i = 0xBC; break;  /* OE ligature */
        case 0x0153: i = 0xBD; break;  /* oe ligature */
        case 0x0178: i = 0xBE; break;  /* Y diaeresis */
        default:                break;
    }
    if (i) {
        fputc (i, stdout);
        return;
    }

    /*
     * substitutions
     */

    s = NULL;
    switch (c) {
        case 0x0132: s = "IJ"; break;
        case 0x0133: s = "ij"; break;
        default:               break;
    }
    if (s) {
        fputs (s, stdout);
        return;
    }

    /*
     * no mapping available
     */

    if (printcode) {
        if (c < 0x10000)
            printf ("U+%04X", (unsigned) c);
        else
            printf ("U+%08lX", c);
    } else
        fputc (191, stdout);
}

/*
 * Usage: program [-c] [utf-8 encoded file]
 *
 * Without a file argument stdin is scanned, unless stdin is a terminal,
 * in which case the usage message is printed.
 */
int main (int argc, char *argv [])
{
    get_programname (argv [0]);

    /* consume leading -c options */
    while (argc > 1)
        if (! strcmp (argv [1], "-c")) {
            printcode = 1;
            argv++;
            argc--;
        } else
            break;

    switch (argc) {
        case 1:
            if (isatty (fileno (stdin)))
                syntax ();
            yyin = stdin;
            break;
        case 2:
            yyin = fopen (argv [1], "r");
            if (! yyin)
                errit ("Opening file \"%s\": %s", argv [1], strerror (errno));
            break;
        default:
            syntax ();
    }

    yylex ();

    if (yyin != stdin)
        fclose (yyin);

    return 0;
}

/*
 * Store a copy of the program's base name in `programname':
 * on MSDOS the file name part as split off by fnsplit(),
 * elsewhere whatever follows the last '/' in argv0 (or all of argv0).
 */
void get_programname (char const *argv0)
{
#ifdef __MSDOS__
    char
        name [MAXFILE];
    fnsplit (argv0, NULL, NULL, name, NULL);
    programname = strdup (name);
#else   /* unix */
    char
        *p;
    p = strrchr (argv0, '/');
    if (p)
        programname = strdup (p + 1);
    else
        programname = strdup (argv0);
#endif
}

/*
 * Print a printf-style error message, prefixed with the program name,
 * to stderr and exit with status 1. Does not return.
 */
void errit (char const *format, ...)
{
    va_list
        list;

    fprintf (stderr, "\nError %s: ", programname);

    va_start (list, format);
    vfprintf (stderr, format, list);
    va_end (list);  /* every va_start must be paired with va_end */

    fprintf (stderr, "\n\n");

    exit (1);
}

/* Print the usage message to stderr and exit with status 1 */
void syntax (void)
{
    fprintf (
        stderr,
        "\n"
        "Syntax: %s [-c] [utf-8 encoded file]\n"
        "\n"
        "The file will be translated to iso-8859-1 *and* iso-8859-15\n"
        "\n"
        " -c : print U+code for characters not in iso-8859-1/15\n"
        "\n",
        programname
    );
    exit (1);
}