% \iffalse meta-comment
%
% Copyright (C) 2026 Alan J. Cain
%
% This file may be distributed and/or modified under the conditions of the LaTeX Project Public License, either version
% 1.3c of this license or (at your option) any later version. The latest version of this license is in:
%
%   http://www.latex-project.org/lppl.txt
%
% and version 1.3c or later is part of all distributions of LaTeX version 2008-05-04 or later.
%
% \fi
%
% \iffalse
%<*driver>
% Documentation driver: sets up the l3doc class and the helper macros used in the documentation part, then typesets
% this file via \DocInput.
\PassOptionsToPackage{inline}{enumitem}
\documentclass{l3doc}
\usepackage{polyglossia}
\setmainlanguage[variant=british]{english}
% Table-of-contents entries: \@dottedtocline{<level>}{<indent>}{<number width>}.
% NOTE(review): the trailing `#2 = 1.5em' remarks look like a record of an earlier value -- confirm.
\makeatletter
\ExplSyntaxOn
\cs_gset:Npn \l@subsection { \@dottedtocline{2}{2.5em}{2.8em} } % #2 = 1.5em
\cs_gset:Npn \l@subsubsection { \@dottedtocline{3}{5.3em}{3.5em} } % #2 = 1.5em
\cs_gset:Npn \l@paragraph { \@dottedtocline{4}{8.8em}{3.2em} } % #2 = 1.5em
\ExplSyntaxOff
\makeatother
% One colour for all kinds of hyperlink.
\usepackage{xcolor}
\definecolor{linkcolor}{rgb}{0.0,0.4,0.7}
\colorlet{citecolor}{linkcolor}
\colorlet{urlcolor}{linkcolor}
\hypersetup{
  linkcolor=linkcolor,%
  citecolor=citecolor,%
  urlcolor=urlcolor,%
}
\usepackage{xurl}
\renewcommand*\UrlBigBreaks{}
% \fullref{<text>}{<label>}: `<text> <number>' as a single link.
\newcommand*\fullref[2]{%
  \hyperref[#2]{#1\penalty 200\ \ref*{#2}}%
}
% \fullpageref{<label>}: `page <number>' as a single link.
\newcommand*\fullpageref[1]{%
  \hyperref[#1]{page\penalty 200\ \pageref*{#1}}%
}
\setcounter{tocdepth}{7}
\numberwithin{figure}{section}
\usepackage{lua-list-hyphen}
\usepackage{lipsum}
\usepackage{tikz}
% Markup for option names, option values and key=value pairs.
\newcommand*\key[1]{\texttt{#1}}
\newcommand*\val[1]{\texttt{#1}}
\newcommand*\keyvalue[2]{\texttt{#1=#2}}
% Description-like list used for the permitted values of a choice option.
\newlist{vallist}{description}{1}
\setlist[vallist]{
  leftmargin=3em,
  style=unboxed,
  labelsep=1em,
  font=\descriptionitemcolon,
  nosep,
}
\newcommand*{\descriptionitemcolon}[1]{\kern 1em #1:}
\NewDocumentCommand{\default}{ m }{(\textit{Default:}\nobreakspace #1)}
% Markup for Lua function and variable names.
\newcommand*\luafunc[1]{\texttt{#1}}
\newcommand*\luavar[1]{\texttt{#1}}
\newcommand*\prefixedurl[1]{\textsc{url}:~\url{#1}}
\begin{document}
  \DocInput{lua-list-hyphen.dtx}
  \PrintIndex
\end{document}
%</driver>
% \fi
%
%
%
%
\GetFileInfo{lua-list-hyphen.sty} % % % % \title{^^A % \pkg{lua-list-hyphen} ^^A % --- Per-language listing of hyphenated words for Lua\LaTeX^^A % \footnote{This document describes \fileversion, last revised \filedate.}^^A % } % % \author{^^A % Alan J. Cain\footnote{\texttt{a.j.cain (AT) gmail.com}}^^A % } % % \date{Released \filedate} % % \maketitle % % % % \begin{abstract} % This Lua\LaTeX\ package writes each word that has been hyphenated across lines to a file, using a different file for % each language, for subsequent external checking. % \end{abstract} % % % % \tableofcontents % % % % \begin{documentation} % % % % \section{Introduction} % % \TeX's algorithm for finding points where a word can be hyphenated is good, but not perfect.\footnote{For a % description of the algorithm and its limitations, see Knuth's account in Appendix~H of \textit{The \TeX book} % (Addison-Wesley, 2021. ISBN:~\texttt{978-0-201-13447-6})} The present author writes in British English, where the % valid division points can depend on both the pronunciation of a word and its internal structure (and hence its % etymology). Currently, \TeX's pattern-based approach produces \textit{bio-lo-gic}, \textit{bio-logy}, % \textit{bio-lo-gist}, rather than the standard \textit{bio-logic}, \textit{biol-ogy}, % \textit{biolo-gist}.\footnote{See the \textit{New Oxford Spelling Dictionary}, which is the authority for word % divisions in British English (Oxford University Press, 2005. ISBN:~\texttt{978-0-19-860881-3}).} To deal with such % cases, at least a substantially larger number of patterns would be required than are available at present. 
There are % also various words where the valid division points in British English cannot be deduced from their spelling alone: for % instance, the verbs \textit{at-trib-ute}, \textit{pre-sent}, \textit{pro-duce}, \textit{re-cord} have different % division points from the orthographically identical nouns \textit{at-tri-bute}, \textit{pres-ent}, \textit{prod-uce}, % \textit{rec-ord}. For another example, compare \textit{cur-ric-ulum vitae} and \textit{school cur-ricu-lum}. % % Easy checking of the chosen hyphenations is desirable. With Lua\TeX, it is possible to extract the hyphenated words. % The Lua\LaTeX\ package \pkg{lua-check-hyphen} offers this facility. It checks hyphenated words against a whitelist, % visually flags unknown hyphenations, and writes unknown hyphenations to a file. But it was first written in 2012, when % Lua\TeX\ was at an earlier stage of development, and so it has certain problems, such as with words containing % ligatures. It also lacks multi-language support. % % This Lua\LaTeX\ package, \pkg{lua-list-hyphen}, uses some ideas from \pkg{lua-check-hyphen} but was written from % scratch to work with a modern Lua\TeX. It simply writes hyphenated words from each language to a separate file, so % that they can be checked (manually or by an external program). % % [The author has written a simple Python application \texttt{hyphenassist}\footnote{\textsc{url}: % \url{https://codeberg.org/ajcain/hyphenassist}.} that checks the listed hyphenations against a dictionary of valid % divisions and allows the user to quickly choose to add entries to the division dictionary, add hyphenation exceptions, % or ignore particular hyphenations. He has used this program in conjunction with code incorporated into this package to % check hyphenations in his own books.\footnote{In particular, \textit{Form \& Number: A History of Mathematical % Beauty}. 
\textsc{url}: \url{https://archive.org/details/cain_formandnumber_ebook_large}.}] % % % % \paragraph*{Licence.} \noindent\pkg{lua-list-hyphen} is released under the \LaTeX\ Project Public Licence v1.3c or % later.\footnote{\textsc{url}: \url{https://www.latex-project.org/lppl.txt}} % % % % \paragraph*{Acknowledgements.} The author thanks Keno Wehr for corrections and comments on the documentation. % % % % \paragraph*{Feature requests and bug reports.} % % The development code and issue tracker are hosted at Codeberg.\footnote{\textsc{url}: % \url{https://codeberg.org/ajcain/lua-list-hyphen}} % % % % \section{Requirements} % % \pkg{lua-list-hyphen} requires % \begin{enumerate}[label={(\arabic*)}] % \item Lua\LaTeX, % \item a recent \LaTeX\ kernel with \pkg{expl3} support (the package uses \cs{ProcessKeyOptions}, so kernel version 2022-06-01 or later is needed). % \end{enumerate} % It does not depend on any other packages, but will interface with \pkg{babel} or \pkg{polyglossia} (if one of them is % loaded) to determine language names. % % % % \section{Installation} % % To install \pkg{lua-list-hyphen} manually, run \texttt{luatex lua-list-hyphen.ins} and copy % \texttt{lua-list-hyphen.sty} and \texttt{lua-list-hyphen.lua} to somewhere Lua\LaTeX\ can find them. % % % % \section{Getting started} % % Simply load the package; the hyphenated words are by default written to the file % \cs{jobname}\file{-}\meta{lang-id}\file{.hyph}, without being sorted or having duplicates removed. The \meta{lang-id} % is either a Lua\TeX\ numerical language~ID, or a \pkg{babel} or \pkg{polyglossia} name of the language, if one of % these packages is in use. The prefix \cs{jobname}\file{-} and the extension \file{.hyph} can be customized; see % \fullref{Section}{sec:options}. % % % % \section{Package options} % \label{sec:options} % % \DescribeOption{verbose} The boolean option \key{verbose} controls how much information is written to the file about % each hyphenated word. 
When \val{true}, for each hyphenated word, both the undivided original and the divided word are % written out, as well as the page number on which the hyphenated word appears (or, more precisely, begins) and the % undivided word in context (as specified by the \key{context} keys; see below). When \val{false}, only the hyphenated % word is written. \default{\val{false}} % % \DescribeOption{context} % \DescribeOption{context-before} % \DescribeOption{context-after} % Integer options controlling how many words before (\key{context-before}) and after (\key{context-after}) the % hyphenated word are written as context when \keyvalue{verbose}{true}. The key \key{context} is simply a shortcut for % setting \key{context-before} and \key{context-after} to the same value. \default{\val{2}} % % \medskip % \DescribeOption{unique} % The option \key{unique} controls removal of duplicates from the list of hyphenated words written out. It can be % set to one of the following three values: % \begin{vallist} % \item[\val{none}] Duplicate hyphenations are not removed. % \item[\val{case}] Hyphenations that are duplicate (case-sensitively) are removed. In this case, the hyphenations % \texttt{geo-metry} and \texttt{Geo-metry} are considered to be distinct. % \item[\val{nocase}] Hyphenations that are duplicate (case-insensitively) are removed. In this case, the hyphenations % \texttt{geo-metry} and \texttt{Geo-metry} are considered to be duplicates. The case of each listed hyphenation % will be that of the first appearance of that hyphenation. % \end{vallist} % Note that removal of duplicates is unaffected by the page number or context that is written out when % \keyvalue{verbose}{true}. \default{\val{none}} % % \DescribeOption{sort} % The option \key{sort} controls sorting of the list of hyphenated words. 
It can be % set to one of the following three values: % \begin{vallist} % \item[\val{none}] Hyphenations appear in the same order as they occur in the document, or, if duplicates are removed, % in the order of first appearance in the document. % \item[\val{case}] Hyphenations are sorted case-sensitively. In this case, \texttt{Geo-metry} precedes % \texttt{geo-meter}. % \item[\val{nocase}] Hyphenations are sorted case-insensitively. In this case, \texttt{geo-meter} precedes % \texttt{Geo-metry}. % \end{vallist} % \default{\val{none}} % % \DescribeOption{include-non-output} % Boolean option determining whether hyphenated words that are never written to the page are listed. (For instance, % a hyphenated word might occur in text that a package temporarily typesets into a box, measures, and then discards.) % \default{\val{false}} % % \medskip % The two options \key{prefix} and \key{extension} specify the files to which hyphenations are written. Between the % prefix and the extension is either a Lua\TeX\ numerical language~ID, or a \pkg{babel} or \pkg{polyglossia} % name of the language, if one of these packages is in use. % % \DescribeOption{prefix} % The \key{prefix} is the part of the file name to which the list of hyphenated words is written, before the % language~ID. % \default{\cs{jobname}\file{-} (note the hyphen).} % % \DescribeOption{extension} % The extension of the file (including the \file{.}) to which the list of hyphenated words for each language is written. % \default{\file{.hyph}} % % \medskip % \DescribeOption{debug} % The boolean option \key{debug} controls whether debugging information is written to the terminal. % \default{\val{false}} % % % % \section{Output format} % % Each output file begins with a header (each line of which begins with a ‘comment’ symbol \texttt{\%}) that includes % information about the language and the package options that were used. Each line of the remainder of the file % describes one hyphenation. 
% % When \keyvalue{verbose}{false}, the line contains only the hyphenated word. % % ^^A Use \symbol{34} in this paragraph since " is an active character. % When \keyvalue{verbose}{true}, the line contains the original undivided word, the hyphenated word, the page number % where the hyphenated word appears (or, to be precise, begins), and the context in which the hyphenated word appears. Each % part of the output is padded so that the various lines align. The undivided and hyphenated words are separated by the % ASCII ‘arrow’ \texttt{->}; the page number is prefixed by \texttt{p.}; and the context is surrounded by (straight) % quotation marks \texttt{\symbol{34}}\kern .5em\texttt{\symbol{34}}. If the hyphenation was never written to the page, % \texttt{p.}\meta{page} is replaced by \texttt{}. (This can only happen with % \keyvalue{include-non-output}{true}.) % % % % \section{Usage notes} % % \subsection{Languages} % % To determine the language of a word, \pkg{lua-list-hyphen} looks at what language is applied at the first possible % hyphenation point, first considering the part of the word before it, then the part after it. In the (presumably rare) % case of a ‘mixed-language’ word like ‘near-Zugzwang’ being specified (using, for example, \pkg{babel}) with % \texttt{near-\cs{foreignlanguage}\{german\}\{Zugzwang\}}, it would be assigned to the language in which \hbox{‘near-’} % is set. % % Duplicates are removed within each language. If the same hyphenation occurs in two different languages, it will appear % in both files, regardless of the value of \key{unique}. % % % % \subsection{Limitations} % % \pkg{lua-list-hyphen} uses Lua\TeX's built-in Unicode functions for pattern matching and converting between upper and % lower case, which are based on the \texttt{slnunicode} library. This library has not been updated for some time and is % based on an out-of-date version of the Unicode standard. Thus there may be problems with languages added to Unicode % more recently. 
Hyphenated words from such languages should still be listed, but may contain extraneous characters
% (such as adjacent punctuation) and may not be sorted correctly. Users may prefer to leave sorting and removal of
% duplicates to an external program that adheres to the current Unicode standard.
%
%
%
% \end{documentation}
%
%
%
% \clearpage
% \begin{implementation}
%
%
%
% \section{Implementation (\LaTeX\ package)}
%
%    \begin{macrocode}
%<*package>
%<@@=lualisthyphen>
%    \end{macrocode}
%
%
%
% \subsection{Initial set-up}
%
% Package identification/version information. The kernel date requested is that of the release that introduced
% \cs{ProcessKeyOptions}, which this package uses to process its options; an older kernel would fail at that point
% with an undefined control sequence rather than with a clear message.
%    \begin{macrocode}
\NeedsTeXFormat{LaTeX2e}[2022-06-01]
\ProvidesExplPackage{lua-list-hyphen}{2026-05-02}{0.3.10}
  {Listing hyphenated words for LuaLaTeX}
%    \end{macrocode}
% Check that Lua\TeX\ is in use. The message is only declared when it is needed; issuing it as a critical message
% stops the package file from being read any further.
%    \begin{macrocode}
\sys_if_engine_luatex:F
  {
    \msg_new:nnn { lua-list-hyphen } { lualatex_required }
      { LuaLaTeX~required.~Package~loading~will~abort. }
    \msg_critical:nn { lua-list-hyphen } { lualatex_required }
  }
%    \end{macrocode}
%
%
%
% \subsection{Options}
%
% \begin{macro}{\l_@@_verbose_bool}
%   Boolean option to indicate whether lists of hyphenations should be written verbosely.
%    \begin{macrocode}
\keys_define:nn { lua-list-hyphen }
  {
    verbose .bool_set:N = \l_@@_verbose_bool ,
  }
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}{\l_@@_context_before_int,\l_@@_context_after_int}
%   Integer options to determine the number of words before and after a hyphenation shown as context in verbose output.
%    \begin{macrocode}
\keys_define:nn { lua-list-hyphen }
  {
    context-before .int_set:N = \l_@@_context_before_int ,
    context-before .initial:n = { 2 } ,
    context-after  .int_set:N = \l_@@_context_after_int ,
    context-after  .initial:n = { 2 } ,
    context        .meta:n    = { context-before = #1 , context-after = #1 } ,
  }
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}{\l_@@_unique_int}
%   Choice option to indicate whether lists of hyphenations should have duplicates removed, case-sensitively or
%   case-insensitively. The stored value is the (one-based) index of the choice less one, so that
%   \val{none}, \val{case}, \val{nocase} are stored as 0, 1, 2.
%    \begin{macrocode}
\int_new:N \l_@@_unique_int
\keys_define:nn { lua-list-hyphen }
  {
    unique .choices:nn =
      { none , case , nocase }
      {
        \int_set_eq:NN \l_@@_unique_int \l_keys_choice_int
        \int_decr:N \l_@@_unique_int
      } ,
  }
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}{\l_@@_sort_int}
%   Choice option to indicate whether lists of hyphenations should be sorted, case-sensitively or case-insensitively.
%   The value is stored in the same way as for \key{unique}.
%    \begin{macrocode}
\int_new:N \l_@@_sort_int
\keys_define:nn { lua-list-hyphen }
  {
    sort .choices:nn =
      { none , case , nocase }
      {
        \int_set_eq:NN \l_@@_sort_int \l_keys_choice_int
        \int_decr:N \l_@@_sort_int
      } ,
  }
%    \end{macrocode}
% \end{macro}
%
%
%
%
% \begin{macro}{\l_@@_include_non_output_bool}
%   Boolean option to indicate whether lists of hyphenations should include those that are never output to the page.
%    \begin{macrocode}
\keys_define:nn { lua-list-hyphen }
  {
    include-non-output .bool_set:N = \l_@@_include_non_output_bool ,
  }
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}{\l_@@_file_prefix_str}
%   String option for the prefix of files to which hyphenations are written. The initial value is the job name
%   followed by a hyphen.
%    \begin{macrocode}
\keys_define:nn { lua-list-hyphen }
  {
    prefix .str_set:N = \l_@@_file_prefix_str ,
    prefix .initial:e = { \c_sys_jobname_str- } ,
  }
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}{\l_@@_file_extension_str}
%   String option for the extension of files to which hyphenations are written.
%    \begin{macrocode}
\keys_define:nn { lua-list-hyphen }
  {
    extension .str_set:N = \l_@@_file_extension_str ,
    extension .initial:n = { .hyph } ,
  }
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}{
%   \l_@@_debug_bool,
%   \l_@@_debug_int
% }
%   Option to specify whether debug information is written to the terminal. Not intended for end users. The key is a
%   genuine boolean (so that \keyvalue{debug}{false} really does switch debugging off, and a bare \key{debug} switches
%   it on); the integer read by the Lua backend is set from it after the options have been processed.
%    \begin{macrocode}
\int_new:N \l_@@_debug_int
\keys_define:nn { lua-list-hyphen }
  {
    debug .bool_set:N = \l_@@_debug_bool ,
  }
%    \end{macrocode}
% \end{macro}
%
%
%
% \subsection{Processing package options}
%
% Process package options.
%    \begin{macrocode}
\ProcessKeyOptions [ lua-list-hyphen ]
%    \end{macrocode}
%
%
%
% Convert boolean options to integers (which can be accessed from Lua). This must happen before the Lua backend is
% loaded, because the backend reads the debug integer while it is being loaded.
%    \begin{macrocode}
\int_new:N \l_@@_verbose_int
\bool_if:NT \l_@@_verbose_bool
  { \int_set_eq:NN \l_@@_verbose_int \c_one_int }
\int_new:N \l_@@_include_non_output_int
\bool_if:NT \l_@@_include_non_output_bool
  { \int_set_eq:NN \l_@@_include_non_output_int \c_one_int }
\bool_if:NT \l_@@_debug_bool
  { \int_set_eq:NN \l_@@_debug_int \c_one_int }
%    \end{macrocode}
%
%
%
% \subsection{Lua backend}
%
% Load the Lua backend.
%    \begin{macrocode}
\lua_now:n { lualisthyphen = require('lua-list-hyphen') }
%    \end{macrocode}
%
%
%
% \subsection{Saving \pkg{babel} language names}
%
% At \texttt{enddocument/afterlastpage}, if possible save \pkg{babel}'s language names. (\pkg{polyglossia}'s names can
% be found directly from Lua.)
%    \begin{macrocode}
\hook_gput_code:nnn { enddocument/afterlastpage } { lua-list-hyphen }
  { \@@_babel_save_language_names: }
%    \end{macrocode}
%
% \begin{macro}{\@@_babel_save_language_names:}
%   If \pkg{babel} is in use, get language names from \cs{bbl@languages}. The function performs assignments, so it is
%   defined as protected.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_babel_save_language_names:
  {
    \cs_if_exist:NT \bbl@languages
      {
%    \end{macrocode}
%   Iterate through \cs{bbl@languages} to get language names. Items stored in this macro are quadruples prefixed with
%   \cs{bbl@elt}, so locally redefine this latter macro to an auxiliary function that passes language ID/name pairs to
%   the Lua backend.
%    \begin{macrocode}
        \group_begin:
          \cs_set_eq:NN \bbl@elt \@@_babel_save_language_names_elt:nnnn
          \bbl@languages
        \group_end:
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_babel_save_language_names_elt:nnnn}
%   Auxiliary function that takes a quadruple stored in \cs{bbl@languages} and passes language ID/name pairs to the Lua
%   backend. The first argument is used as the name and the second as the ID; the third and fourth are ignored.
%    \begin{macrocode}
\cs_new:Npn \@@_babel_save_language_names_elt:nnnn #1#2#3#4
  {
    \lua_now:n { lualisthyphen.babel_save_language_name(#2,'#1') }
  }
%    \end{macrocode}
% \end{macro}
%
%
%
% \subsection{Processing and writing hyphenation lists}
%
% At \texttt{enddocument/info}, process and output the hyphenations that have been found.
%    \begin{macrocode}
\hook_gput_code:nnn { enddocument/info } { lua-list-hyphen }
  {
    \@@_process_write_hyphenation_lists:ee
      { \str_use:N \l_@@_file_prefix_str }
      { \str_use:N \l_@@_file_extension_str }
  }
%    \end{macrocode}
%
%
%
% \begin{macro}{\@@_process_write_hyphenation_lists:nn}
%   Sort the list of hyphenations into separate lists for each language, sort and deduplicate them as required, and
%   write them to files with prefix given in the first parameter and suffix in the second. Both parameters are
%   escaped before being placed inside Lua string literals.
%    \begin{macrocode}
\cs_new:Npn \@@_process_write_hyphenation_lists:nn #1#2
  {
    \lua_now:e
      {
        lualisthyphen.process_write_hyphenation_lists(
          '\luaescapestring{#1}',
          '\luaescapestring{#2}'
        )
      }
  }
\cs_generate_variant:Nn \@@_process_write_hyphenation_lists:nn { ee }
%    \end{macrocode}
% \end{macro}
%
%
%
%    \begin{macrocode}
%</package>
%    \end{macrocode}
%
%
%
% \section{Implementation (Lua backend)}
%
%    \begin{macrocode}
%<*lua>
%    \end{macrocode}
%
%
%
% \subsection{Debugging function}
%
% \begin{macro}[int]{debug}
%   Debugging function. Defined according to the package option \key{debug} to either do nothing or write debugging
%   information. (Within this file the local name \luavar{debug} hides Lua's standard library table of the same
%   name.)
%    \begin{macrocode}
local debug
if tex.count['l__lualisthyphen_debug_int'] == 0 then
  debug = function(s)
  end
else
  debug = function(s)
    print('lua-list-hyphen DEBUG: ' .. s)
  end
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \subsection{Table key constants}
%
% Keys for tables containing hyphenatable/hyphenated word data.
%    \begin{macrocode}
local KEY_TYPE = 'type'
local KEY_WORD = 'word'
local KEY_LANG = 'lang'
local KEY_DIVISION = 'division'
local KEY_INDEX = 'index'
local KEY_CONTEXT = 'context'
local KEY_PAGE = 'page'
%    \end{macrocode}
%
%
%
% \subsection{Segment type}
%
% Constants for types of segments found while scanning hlist before linebreaking.
%    \begin{macrocode}
local SEGMENT_WORD = 0
local SEGMENT_SPACE = 1
local SEGMENT_MATH = 2
%    \end{macrocode}
%
%
%
% \subsection{Node ID and subtype constants}
%
% Define constants for the node IDs that need to be recognized.
%    \begin{macrocode}
local NODE_ID_HLIST = node.id('hlist')
local NODE_ID_DISC = node.id('disc')
local NODE_ID_GLUE = node.id('glue')
local NODE_ID_KERN = node.id('kern')
local NODE_ID_MARGIN_KERN = node.id('margin_kern')
local NODE_ID_GLYPH = node.id('glyph')
local NODE_ID_MATH = node.id('math')
%    \end{macrocode}
% Define constants for the kern node subtypes that have to be recognized. (There seems to be no automatic way to get
% the numerical value from the subtype text other than searching the \luavar{node.subtypes(\meta{node type})} tables.)
%    \begin{macrocode}
local NODE_KERN_SUBTYPE_FONTKERN
local NODE_KERN_SUBTYPE_USERKERN
for k,v in pairs(node.subtypes('kern')) do
  if v == 'fontkern' then
    NODE_KERN_SUBTYPE_FONTKERN = k
  elseif v == 'userkern' then
    NODE_KERN_SUBTYPE_USERKERN = k
  end
end
%    \end{macrocode}
% Define constants for the math node subtypes.
%    \begin{macrocode}
local NODE_MATH_SUBTYPE_BEGIN
local NODE_MATH_SUBTYPE_END
for k,v in pairs(node.subtypes('math')) do
  if v == 'beginmath' then
    NODE_MATH_SUBTYPE_BEGIN = k
  elseif v == 'endmath' then
    NODE_MATH_SUBTYPE_END = k
  end
end
%    \end{macrocode}
%
%
%
% \subsection{Output constants}
%
% Constants for output.
%    \begin{macrocode}
local STR_MATH = '[MATH]'
local STR_SPACE = ' '
-- NOTE(review): the name suggests two spaces; confirm that this literal has
-- not lost a space somewhere along the way.
local STR_SPACE_TWO = ' '
local STR_ARROW = ' -> '
local STR_PAGE_PREFIX = 'p.'
local STR_PAGE_NONE = ''
local STR_QUOTE_OPEN = '"'
local STR_QUOTE_CLOSE = '"'
%    \end{macrocode}
%
%
%
% \subsection{Utility functions}
%
% \begin{macro}[int]{list_filter}
%   Take a list \luavar{t} and remove from it any elements for which the function
%   \luavar{f} does not return true. (The index \luavar{j} is always the destination index to which a ‘keep’ element
%   is moved.)\footnote{Code adapted from \url{https://stackoverflow.com/a/53038524}.} The list is modified in place
%   and nothing is returned. The list must be a proper sequence (no \luavar{nil} holes), because its length is taken
%   with the \luavar{\#} operator.
%    \begin{macrocode}
local function list_filter(t, f)
  local j = 1
  local n = #t
  for i=1,n do
    if (f(t[i])) then
      if (i ~= j) then
        t[j] = t[i]
        t[i] = nil
      end
      j = j + 1
    else
      t[i] = nil
    end
  end
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{list_uniq}
%   Take a list \luavar{t} and remove from it adjacent elements for which the function \luavar{f} returns true. The
%   function \luavar{f} is called with the element under consideration and the last ‘kept’ element, in that order.
%   (The index \luavar{j} is always the position of the last ‘kept’ element.) The list is modified in place and
%   nothing is returned.
%
%   Kept elements are moved down as they are found and the tail is cleared afterwards, so the list never contains
%   \luavar{nil} holes while its length is being relied upon. (Punching holes first and compacting afterwards is
%   unsafe: the \luavar{\#} operator may return any border of a table with holes, so later elements could be left
%   stranded beyond the reported length.)
%    \begin{macrocode}
local function list_uniq(t, f)
  local n = #t
  local j = 1
  for i=2,n do
    local candidate = t[i]
    if not f(candidate,t[j]) then
      j = j + 1
      t[j] = candidate
    end
  end
  -- Clear everything after the last kept element (nothing to do if n < 2).
  for i=j+1,n do
    t[i] = nil
  end
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \subsection{Getting text from nodes}
%
% Getting the components of the ligatures that have Unicode code points can be problematic, at least for some fonts,
% so define a lookup table for these cases.
%    \begin{macrocode}
local LIGATURE_TEXT = {
  [0xfb00] = 'ff',
  [0xfb01] = 'fi',
  [0xfb02] = 'fl',
  [0xfb03] = 'ffi',
  [0xfb04] = 'ffl',
}
%    \end{macrocode}
%
%
%
% Cache to save table lookups when extracting text.
%    \begin{macrocode}
local font_characters = {}
%    \end{macrocode}
%
%
%
% Extracting text from nodes uses two functions that call each other, so the names have to be defined ahead of time.
%    \begin{macrocode}
local get_node_text
local get_nodelist_text
%    \end{macrocode}
%
%
%
% \begin{macro}[int]{tounicode_to_text}
%   Convert a font's \luavar{tounicode} value \luavar{u} to UTF-8 text, or return \luavar{nil} if that is not
%   possible. A number is taken to be a code point. A string of hexadecimal digits whose length is a multiple of four
%   is read as a sequence of UTF-16 code units, so that multi-character mappings and surrogate pairs are decoded
%   rather than being read as one enormous number; any other hexadecimal string is read as a single code point.
%    \begin{macrocode}
local function tounicode_to_text(u)
  if type(u) == 'number' then
    return utf8.char(u)
  end
  if type(u) ~= 'string' or not u:match('^%x+$') then
    return nil
  end
  if #u % 4 ~= 0 then
    local codepoint = tonumber(u,16)
    if codepoint and codepoint <= 0x10FFFF then
      return utf8.char(codepoint)
    end
    return nil
  end
  local units = {}
  for hex in u:gmatch('%x%x%x%x') do
    units[#units + 1] = tonumber(hex,16)
  end
  local pieces = {}
  local i = 1
  while i <= #units do
    local codepoint = units[i]
    local low = units[i + 1]
    if codepoint >= 0xD800 and codepoint <= 0xDBFF
      and low and low >= 0xDC00 and low <= 0xDFFF
    then
      -- Combine a high/low surrogate pair into one code point.
      codepoint = 0x10000 + (codepoint - 0xD800) * 0x400 + (low - 0xDC00)
      i = i + 1
    end
    pieces[#pieces + 1] = utf8.char(codepoint)
    i = i + 1
  end
  return table.concat(pieces)
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{get_node_text}
%   Return the text content of the node \luavar{n}. For a glyph node (which might be a normal glyph, a ligature,
%   etc.) this is, in order of preference: the entry in \luavar{LIGATURE_TEXT}; the text of its components; the
%   text given by the font's \luavar{tounicode} entry; and, as a last resort when the font supplies nothing usable,
%   the character code of the glyph itself. For a disc node it is the text of the \luavar{replace} list (or the
%   empty string if there is none). For any other node it is the empty string.
%    \begin{macrocode}
get_node_text = function(n)
  if n.id == NODE_ID_GLYPH then
    local ligature_text = LIGATURE_TEXT[n.char]
    if ligature_text ~= nil then
      return ligature_text
    elseif n.components then
      return get_nodelist_text(n.components)
    else
      -- See [https://tug.org/pipermail/luatex/2018-March/006786.html]
      local characters = font_characters[n.font]
      if not characters then
        local font_data = fonts.hashes.identifiers[n.font]
        characters = font_data and font_data.characters
        -- Only cache a real table, so that a later lookup can still succeed.
        if characters then
          font_characters[n.font] = characters
        end
      end
      local character = characters and characters[n.char]
      local text = character and tounicode_to_text(character.tounicode)
      return text or utf8.char(n.char)
    end
  elseif n.id == NODE_ID_DISC then
    if n.replace then
      return get_nodelist_text(n.replace)
    else
      return ''
    end
  else
    return ''
  end
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{get_nodelist_text}
%   Return the text content of the glyph nodes in the list starting at \luavar{head} up to and including the node
%   \luavar{last}, or up to the end of the list if \luavar{last} is not specified.
%    \begin{macrocode}
get_nodelist_text = function (head,last)
  local pieces = {}
  for item in node.traverse(head) do
    pieces[#pieces + 1] = get_node_text(item)
    if item == last then
      break
    end
  end
  return table.concat(pieces)
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{is_possible_word_node}
%   Return boolean indicating if node \luavar{n} could be part of a word. Assume that \luavar{glyph}, \luavar{disc},
%   and \luavar{margin_kern} nodes could be part of a word, as could a \luavar{kern} node with subtype
%   \luavar{fontkern}.
%    \begin{macrocode}
local function is_possible_word_node(n)
  local id = n.id
  if id == NODE_ID_GLYPH or id == NODE_ID_DISC or id == NODE_ID_MARGIN_KERN then
    return true
  end
  return id == NODE_ID_KERN and n.subtype == NODE_KERN_SUBTYPE_FONTKERN
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{is_possible_space_node}
%   Return boolean indicating if node \luavar{n} could be part of a space. Assume that \luavar{glue} nodes could be
%   part of a space, as could a \luavar{kern} node with subtype \luavar{userkern}.
%    \begin{macrocode}
local function is_possible_space_node(n)
  local id = n.id
  if id == NODE_ID_GLUE then
    return true
  end
  return id == NODE_ID_KERN and n.subtype == NODE_KERN_SUBTYPE_USERKERN
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \subsection{String manipulation}
%
% \begin{macro}[int]{trim_nonlettershyphens_both}
%   Remove characters other than letters and hyphens from both the start and end of a string.
%    \begin{macrocode}
local function trim_nonlettershyphens_both(s)
  local core = unicode.utf8.match(s,'^[^%a-]*(.-)[^%a-]*$')
  return core
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{trim_nonlettershyphens_start}
%   Remove characters other than letters and hyphens from the start of a string.
%    \begin{macrocode}
local function trim_nonlettershyphens_start(s)
  local rest = unicode.utf8.match(s,'^[^%a-]*(.-)$')
  return rest
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{trim_nonlettershyphens_end}
%   Remove characters other than letters and hyphens from the end of a string.
%    \begin{macrocode}
local function trim_nonlettershyphens_end(s)
  local front = unicode.utf8.match(s,'^(.-)[^%a-]*$')
  return front
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{rpad}
%   Return string \luavar{s} padded on the right with spaces to length \luavar{n}. Lengths are counted in
%   characters, not bytes.
%    \begin{macrocode}
local function rpad(s,n)
  local missing = n - unicode.utf8.len(s)
  local filler = unicode.utf8.rep(STR_SPACE,missing)
  return s .. filler
end
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}[int]{lpad}
%   Return string \luavar{s} padded on the left with spaces to length \luavar{n}.
%    \begin{macrocode}
local function lpad(s,n)
  local missing = n - unicode.utf8.len(s)
  local filler = unicode.utf8.rep(STR_SPACE,missing)
  return filler .. s
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \subsection{Pre-linebreak processing}
%
% Before each line has been broken, find all potential division points and store the words in which they occur,
% linking each potential break point to the corresponding word.
%
% Declare a new attribute, which will be used to store in each disc node the index of the corresponding word in the
% table \luavar{hlist_segment_list}.
%    \begin{macrocode}
local hyphen_attr = luatexbase.new_attribute('hyphen_attr')
%    \end{macrocode}
%
%
%
% Table to hold segments (word/space/math) in the hlist that will be broken. This table will be cleared after the
% post-linebreak processing.
%    \begin{macrocode}
local hlist_segment_list = {}
%    \end{macrocode}
%
%
%
% \begin{macro}[int]{get_first_glyph_lang}
%   Return the lang attribute of the first glyph in the part of the list starting at \luavar{n} that could be part
%   of a word. (Currently unused; see the documentation of \luafunc{get_disc_lang}.)
%    \begin{macrocode}
-- local function get_first_glyph_lang(n)
--   local item = n
--   while item and is_possible_word_node(item) do
--     if item.id == NODE_ID_GLYPH then
--       return item.lang
--     end
--     item = item.next
--   end
--   return nil
-- end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{get_disc_lang}
%   Try to find the language ID in force at a given disc node by looking at (1)~the last glyph in the word
%   before the disc node; (2)~the first glyph in the word after the disc node. Default to language ID \luavar{0}.
%
%   (Looking at \luavar{replace}, \luavar{pre}, \luavar{post} is possible, but is unreliable and so disabled for the
%   present. The author has encountered the situation where an explicit hyphen results in the hyphen characters in
%   \luavar{replace} and \luavar{pre} having different language IDs.
He has not had time to investigate how this
%   arises from the interaction of \pkg{babel}/\pkg{polyglossia} and Lua\LaTeX.)
%    \begin{macrocode}
local function get_disc_lang(n)
  -- lang = get_first_glyph_lang(n.replace)
  -- if lang then
  --   print(lang)
  --   return lang
  -- end
  -- lang = get_first_glyph_lang(n.pre)
  -- if lang then
  --   print(lang)
  --   return lang
  -- end
  -- lang = get_first_glyph_lang(n.post)
  -- if lang then
  --   return lang
  -- end
  local item
%    \end{macrocode}
%   Before the disc node.
%    \begin{macrocode}
  item = n
  while item and is_possible_word_node(item) do
    if item.id == NODE_ID_GLYPH then
      return item.lang
    end
    item = item.prev
  end
%    \end{macrocode}
%   After the disc node.
%    \begin{macrocode}
  item = n
  while item and is_possible_word_node(item) do
    if item.id == NODE_ID_GLYPH then
      return item.lang
    end
    item = item.next
  end
  return 0
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{pre_linebreak}
%   Extract segments (word/space/math) from the hlist at \luavar{hlist_head} and store appropriate data in
%   \luavar{hlist_segment_list}. For spaces and math, this is just the existence of a segment. For a word, store its
%   text and its language ID (as determined by \luafunc{get_disc_lang}). Also, for each disc node, assign the index
%   of the word in \luavar{hlist_segment_list} to its \luavar{hyphen_attr} attribute (declared above).
%
%   The parameter \luavar{groupcode} is not used. The function always returns \luavar{true}; the node list itself
%   is not altered apart from the attribute being set on disc nodes.
%    \begin{macrocode}
local function pre_linebreak(hlist_head,groupcode)
  local word_start_node = nil
  local segment_count = 0
  local lang = nil
  debug('Pre-linebreak processing start')
  local item = hlist_head
  while item do
%    \end{macrocode}
%   If \luavar{item} is a math node (which must have subtype beginmath, unless something has changed the node list),
%   skip the math and add "[MATH]" to \luavar{hlist_segment_list}. Should the list end before the matching endmath
%   node is found, stop at the end of the list rather than trying to read beyond it.
%    \begin{macrocode}
    if item.id == NODE_ID_MATH then
      assert(item.subtype == NODE_MATH_SUBTYPE_BEGIN)
      while item and not (
        item.id == NODE_ID_MATH
        and item.subtype == NODE_MATH_SUBTYPE_END
      ) do
        item = item.next
      end
      if item then
        item = item.next
      end
      segment_count = segment_count + 1
      hlist_segment_list[segment_count] = {
        [KEY_TYPE] = SEGMENT_MATH
      }
      goto continue
    end
%    \end{macrocode}
%   If \luavar{item} is a possible word node, read the whole word, setting the \luavar{hyphen_attr} of any disc nodes
%   to \luavar{segment_count}, and adding the word to \luavar{hlist_segment_list}.
%    \begin{macrocode}
    if is_possible_word_node(item) then
      word_start_node = item
      segment_count = segment_count + 1
      while item and is_possible_word_node(item) do
%    \end{macrocode}
%   When the first disc node is found, find the language of the word.
%    \begin{macrocode}
        if item.id == NODE_ID_DISC then
          if not lang then
            lang = get_disc_lang(item)
          end
          node.set_attribute(item,hyphen_attr,segment_count)
        end
        item = item.next
      end
%    \end{macrocode}
%   \luavar{item} should be a node, because even after the last word node, the hlist will contain something. But just
%   in case, check and find the last node using \luafunc{node.tail} if necessary. This latter case should be very
%   rare, so it is more efficient to recalculate here if necessary rather than having an extra assignment to store the
%   previous node in the while loop.
%    \begin{macrocode}
      local word_end_node
      if item then
        word_end_node = item.prev
      else
        word_end_node = node.tail(word_start_node)
      end
      local word = get_nodelist_text(word_start_node,word_end_node)
      hlist_segment_list[segment_count] = {
        [KEY_TYPE] = SEGMENT_WORD,
        [KEY_WORD] = word,
        [KEY_LANG] = lang,
      }
      word_start_node = nil
      lang = nil
      goto continue
    end
%    \end{macrocode}
%   If \luavar{item} is a node that could be part of a space, add a space to the segment list.
%    \begin{macrocode}
    if is_possible_space_node(item) then
      segment_count = segment_count + 1
      while item and is_possible_space_node(item) do
        item = item.next
      end
      hlist_segment_list[segment_count] = {
        [KEY_TYPE] = SEGMENT_SPACE
      }
      goto continue
    end
%    \end{macrocode}
%   If \luavar{item} is anything else, just move on.
%    \begin{macrocode}
    item = item.next
    ::continue::
  end
  debug('Pre-linebreak processing finish')
  return true
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \subsection{Post-linebreak processing}
%
% After linebreaking, look for a discretionary node at the end of each line, which indicates that a word has been
% divided between the end of that line and the start of the next. Extract the two word-pieces from the lines and store
% them, together with the undivided word and its context in the appropriate language table. Also insert a whatsit
% that will set the page number when the hyphenation is written out.
%
% \begin{macro}[int]{get_used_disc}
%   If at the tail of the hlist at \luavar{hlist_head} (which will be a line) there is a disc node not followed by a
%   glyph node, return that disc node. Otherwise return \luavar{nil}.
%    \begin{macrocode}
local function get_used_disc(hlist_head)
  local item = node.tail(hlist_head)
  -- Walk backwards from the end of the line; stop at the first glyph.
  while item and item.id ~= NODE_ID_GLYPH do
    if item.id == NODE_ID_DISC then
      return item
    end
    item = item.prev
  end
  return nil
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{get_disc_word_start}
%   Return the node starting the word that includes a given disc node \luavar{n}, or \luavar{nil} if there is no such
%   node. (The parameter \luavar{hlist_head} is not used.)
%    \begin{macrocode}
local function get_disc_word_start(hlist_head,n)
  local item = n
  while item do
    local prev = item.prev
    -- The word starts where the preceding node cannot belong to a word.
    if not (prev and is_possible_word_node(prev)) then
      return item
    end
    item = prev
  end
  return nil
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{get_next_hlist}
%   Return the next hlist in the list containing the given node \luavar{n}, or \luavar{nil} if there is no such hlist
%   node.
%    \begin{macrocode}
local function get_next_hlist(n)
    local item = n.next
    while item do
        if item.id == NODE_ID_HLIST then
            return item
        end
        item = item.next
    end
    return nil
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{get_line_first_word}
%   Return the first word in the hlist at \luavar{hlist_head}, or \luavar{nil} if there is no such word.
%    \begin{macrocode}
local function get_line_first_word(hlist_head)
%    \end{macrocode}
%   \luavar{word_start_node} is either \luavar{nil} or the (glyph) node that starts the word.
%    \begin{macrocode}
    local word_start_node = nil
    for item in node.traverse(hlist_head) do
        if item.id == NODE_ID_GLYPH then
            if not word_start_node then
                word_start_node = item
            end
        end
        if not is_possible_word_node(item) then
            if word_start_node then
                return get_nodelist_text(word_start_node,item.prev)
            end
        end
    end
%    \end{macrocode}
%   It is possible that the word ends at the end of the hlist, so check if a word has been started.
%    \begin{macrocode}
    if word_start_node then
        return get_nodelist_text(word_start_node,node.tail(hlist_head))
    else
        return nil
    end
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{get_context}
%   Return a string assembled from the part of \luavar{hlist_segment_list} before or after \luavar{index} according to
%   \luavar{incr} (which must be \(\pm 1\)) up to a maximum of \luavar{target_word_count} words. Space and math
%   segments are included in the string but do not count towards the number of words.
%    \begin{macrocode}
local function get_context(index,incr,target_word_count)
    local result = ''
    local word_count = 0
    local i = index + incr
    while (
        i > 0
        and i <= #hlist_segment_list
        and word_count < target_word_count
    ) do
        local t = hlist_segment_list[i]
        local item
        if t[KEY_TYPE] == SEGMENT_WORD then
            item = t[KEY_WORD]
            word_count = word_count + 1
        elseif t[KEY_TYPE] == SEGMENT_SPACE then
            item = STR_SPACE
        elseif t[KEY_TYPE] == SEGMENT_MATH then
            item = STR_MATH
        end
%    \end{macrocode}
%   Append when moving forwards and prepend when moving backwards, so that the result always reads in document order.
%    \begin{macrocode}
        if incr > 0 then
            result = result .. item
        else
            result = item .. result
        end
        i = i + incr
    end
    return result
end
%    \end{macrocode}
% \end{macro}
%
%
%
% Count and list for hyphenated words. Each entry in the list will be a table containing the original word, the
% hyphenation, the language, the index of the table in the list (which is needed later for stable sorting and sorting
% into the original order), and the context.
%    \begin{macrocode}
local hyphenation_list = {}
local hyphenation_count = 0
%    \end{macrocode}
%
%
%
% \begin{macro}[int]{check_line_hyphenation}
%   Check whether there is a hyphenated word at the end of the given hlist; if so, save the word to
%   \luavar{hyphenation_list}.
%    \begin{macrocode}
local function check_line_hyphenation(hlist)
%    \end{macrocode}
%   First, is there a disc node not followed by a glyph node at the end of the list?
%    \begin{macrocode}
    local last_disc = get_used_disc(hlist.head)
    if not last_disc then
        debug(' No disc node found at end of line')
        return
    end
%    \end{macrocode}
%   Get the undivided word and its language from \luavar{hlist_segment_list}. A disc node that was not part of a word
%   seen during pre-linebreak processing (for instance, one inside math, which is skipped there) has no
%   \luavar{hyphen_attr} value; such a node is ignored rather than being allowed to trigger the assertions below.
%    \begin{macrocode}
    local hyphenation_index = node.has_attribute(last_disc,hyphen_attr)
    if not hyphenation_index then
        debug(' Disc node at end of line does not belong to a recorded word')
        return
    end
    local t = hlist_segment_list[hyphenation_index]
    assert(t)
    assert(t[KEY_TYPE] == SEGMENT_WORD)
    local word = t[KEY_WORD]
    local lang = t[KEY_LANG]
%    \end{macrocode}
%   \luavar{word} might be something other than a genuine word, such as an ISBN (with hyphen separators). So only
%   proceed if it contains at least one letter.
%    \begin{macrocode}
    if not unicode.utf8.match(word,'%a') then
        debug(' Divided "word" contains no letters')
        return
    end
%    \end{macrocode}
%   There should always be a next line, since there is a disc node at the end of \luavar{hlist}, but check anyway.
%    \begin{macrocode}
    local next_line = get_next_hlist(hlist)
    if not next_line then
        debug(' No following line found (which should not happen)')
        return
    end
%    \end{macrocode}
%   For the pre-linebreak part of the word, get the word that ends the line, and trim any leading non-letters. This
%   could leave an empty word; for example, if \(n\)-dimensional is broken at the hyphen, the word ending the line is
%   just the hyphen. If an empty word is left, just use the non-trimmed result.
%    \begin{macrocode}
    local pre = get_nodelist_text(get_disc_word_start(hlist.head,last_disc))
    local pre_temp = trim_nonlettershyphens_start(pre)
    if pre_temp ~= '' then
        pre = pre_temp
    end
%    \end{macrocode}
%   For the post-linebreak part, just get the word at the start of the next line, and trim any trailing non-letters.
%   \luafunc{get_line_first_word} returns \luavar{nil} if the next line contains no word, so fall back to the empty
%   string in that case.
%    \begin{macrocode}
    local post = trim_nonlettershyphens_end(
        get_line_first_word(next_line.head) or ''
    )
%    \end{macrocode}
%   Compute the context and then trim any unwanted symbols from the word itself.
%    \begin{macrocode}
    local context = get_context(
        hyphenation_index,-1,tex.count['l__lualisthyphen_context_before_int']
    ) .. word .. get_context(
        hyphenation_index,1,tex.count['l__lualisthyphen_context_after_int']
    )
    word = trim_nonlettershyphens_both(word)
    debug(
        ' Hyphenated word found: "' .. word .. '" -> "' .. pre .. '<>' .. post .. '"'
    )
%    \end{macrocode}
%   Store everything (except the page number on which the hyphenated word appears, which is not yet known) in the
%   hyphenation list.
%    \begin{macrocode}
    hyphenation_count = hyphenation_count + 1
    hyphenation_list[hyphenation_count] = {
        [KEY_LANG] = lang,
        [KEY_WORD] = word,
        [KEY_DIVISION] = pre .. post,
        [KEY_INDEX] = hyphenation_count,
        [KEY_CONTEXT] = context,
    }
%    \end{macrocode}
%   Add a whatsit to record the page number when the page with the hyphenation is shipped out. This information also
%   serves to distinguish hyphenations that are written to the page from those that occur in (e.g.) boxes that are
%   discarded without being written to the page. The node is held in a local variable so that nothing leaks into the
%   global Lua environment.
%    \begin{macrocode}
    local late_lua_n = node.new('whatsit','late_lua')
    late_lua_n.data = 'lualisthyphen.set_hyphenation_page(' .. hyphenation_count .. ',tex.count["c@page"])'
    node.insert_after(hlist.head,last_disc,late_lua_n)
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{set_hyphenation_page}
%   Set the page on which the hyphenation with the given index appears.
%    \begin{macrocode}
local function set_hyphenation_page(index,page)
    hyphenation_list[index][KEY_PAGE] = page
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{post_linebreak}
%   For every line in the vlist at \luavar{vlist_head}, check whether there is a hyphenated word at the end. Once all
%   lines have been checked, \luavar{hlist_segment_list} is emptied ready for the next paragraph.
%    \begin{macrocode}
local function post_linebreak(vlist_head,groupcode)
    debug('Post-linebreak processing start')
    local line_no = 0
    for item in node.traverse(vlist_head) do
        if item.id == NODE_ID_HLIST then
            line_no = line_no + 1
            debug(' Line no.' .. line_no)
            check_line_hyphenation(item)
        end
    end
    hlist_segment_list = {}
    debug('Post-linebreak processing end')
    return true
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \subsection{Callbacks}
%
% Add \luafunc{pre_linebreak} and \luafunc{post_linebreak} to the relevant callbacks.
%    \begin{macrocode}
local LUA_LIST_HYPHEN_PRE_LINEBREAK = 'LUA_LIST_HYPHEN_PRE_LINEBREAK'
luatexbase.add_to_callback(
    'pre_linebreak_filter',
    pre_linebreak,
    LUA_LIST_HYPHEN_PRE_LINEBREAK
)
local LUA_LIST_HYPHEN_POST_LINEBREAK = 'LUA_LIST_HYPHEN_POST_LINEBREAK'
luatexbase.add_to_callback(
    'post_linebreak_filter',
    post_linebreak,
    LUA_LIST_HYPHEN_POST_LINEBREAK
)
%    \end{macrocode}
%
%
%
% \subsection{Language settings}
%
% Table mapping language IDs to textual names.
%    \begin{macrocode}
local language_table = {}
%    \end{macrocode}
%
% Populating \luavar{language_table} is done differently for \pkg{babel} and \pkg{polyglossia}. If \pkg{babel} is in
% use, the \LaTeX\ frontend iterates through \cs{bbl@languages} and calls \luafunc{babel_save_language_name}. If
% \pkg{polyglossia} is in use, \luavar{language_table} is populated by \luafunc{polyglossia_get_language_names}, which
% is called just before the hyphenation lists are written.
%
% \begin{macro}[int]{babel_save_language_name}
%   Store the association of a language ID to \pkg{babel}'s textual name, if no name has been assigned to that ID
%   already.
%    \begin{macrocode}
local function babel_save_language_name(lang_id,name)
    if not language_table[lang_id] then
        language_table[lang_id] = name
    end
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{polyglossia_get_language_names}
%   If polyglossia has been loaded, use it to build the table mapping language IDs to textual names. Nothing is done
%   if the global \luavar{polyglossia} table is absent or does not provide \luavar{newloader_loaded_languages}.
%    \begin{macrocode}
local function polyglossia_get_language_names()
    local loaded_languages = polyglossia and polyglossia.newloader_loaded_languages
    if not loaded_languages then
        return
    end
    for name,language in pairs(loaded_languages) do
        language_table[lang.id(language)] = name
    end
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \subsection{Processing hyphenation lists}
%
% Before writing out hyphenation lists, remove duplicates and/or perform sorting, in accordance with the set options.
%
%
%
% \subsubsection{Comparisons and equality checks}
%
% \begin{macro}[int]{equal_hyphenation_case_sensitive}
%   Equality check for deduplicating the list of hyphenations case-sensitively. Two entries are equal when both the
%   word and its division agree.
%    \begin{macrocode}
local function equal_hyphenation_case_sensitive(a,b)
    return (
        a[KEY_WORD] == b[KEY_WORD]
        and a[KEY_DIVISION] == b[KEY_DIVISION]
    )
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{equal_hyphenation_case_insensitive}
%   Equality check for deduplicating the list of hyphenations case-insensitively. Two entries are equal when both the
%   word and its division agree after lower-casing.
%    \begin{macrocode}
local function equal_hyphenation_case_insensitive(a,b)
    return (
        unicode.utf8.lower(a[KEY_WORD]) == unicode.utf8.lower(b[KEY_WORD])
        and unicode.utf8.lower(a[KEY_DIVISION]) == unicode.utf8.lower(b[KEY_DIVISION])
    )
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{lessthan_hyphenation_case_sensitive}
%   Comparison for sorting the list of hyphenations case-sensitively.
%
%   The comparison of index keys ensures that the sorting is stable.
%    \begin{macrocode}
local function lessthan_hyphenation_case_sensitive(a,b)
    if a[KEY_WORD] ~= b[KEY_WORD] then
        return a[KEY_WORD] < b[KEY_WORD]
    end
    if a[KEY_DIVISION] ~= b[KEY_DIVISION] then
        return a[KEY_DIVISION] < b[KEY_DIVISION]
    end
    return a[KEY_INDEX] < b[KEY_INDEX]
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{lessthan_hyphenation_case_insensitive}
%   Comparison for sorting the list of hyphenations case-insensitively.
%
%   The comparison of index keys ensures that the sorting is stable: entries are ordered by lower-cased word, then by
%   lower-cased division, and only when both of these are equal by their original index.
%    \begin{macrocode}
local function lessthan_hyphenation_case_insensitive(a,b)
    local a_word = unicode.utf8.lower(a[KEY_WORD])
    local b_word = unicode.utf8.lower(b[KEY_WORD])
    if a_word ~= b_word then
        return a_word < b_word
    end
    local a_division = unicode.utf8.lower(a[KEY_DIVISION])
    local b_division = unicode.utf8.lower(b[KEY_DIVISION])
    if a_division ~= b_division then
        return a_division < b_division
    end
    return a[KEY_INDEX] < b[KEY_INDEX]
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \subsubsection{Sorting}
%
% \begin{macro}[int]{sort_hyphenation_list_none}
%   Sort \luavar{hyphenation_list} into its original order of appearance.
%    \begin{macrocode}
local function sort_hyphenation_list_none(hyphenation_list)
    table.sort(
        hyphenation_list,
        function(a,b)
            return a[KEY_INDEX] < b[KEY_INDEX]
        end
    )
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{sort_hyphenation_list_case}
%   Sort \luavar{hyphenation_list} case-sensitively.
%    \begin{macrocode}
local function sort_hyphenation_list_case(hyphenation_list)
    table.sort(
        hyphenation_list,
        lessthan_hyphenation_case_sensitive
    )
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{sort_hyphenation_list_nocase}
%   Sort \luavar{hyphenation_list} case-insensitively.
%    \begin{macrocode}
local function sort_hyphenation_list_nocase(hyphenation_list)
    table.sort(hyphenation_list,lessthan_hyphenation_case_insensitive)
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{sort_hyphenation_list}
%   Select the appropriate function for sorting: case-sensitive if the sort setting is 1, case-insensitive if it is 2,
%   and original order otherwise.
%    \begin{macrocode}
local sort_hyphenation_list = sort_hyphenation_list_none
do
    local sort_setting = tex.count['l__lualisthyphen_sort_int']
    if sort_setting == 1 then
        sort_hyphenation_list = sort_hyphenation_list_case
    elseif sort_setting == 2 then
        sort_hyphenation_list = sort_hyphenation_list_nocase
    end
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \subsubsection{Deduplication}
%
% \begin{macro}[int]{deduplicate_hyphenation_list_none}
%   Dummy function; does not deduplicate \luavar{hyphenation_list}.
%    \begin{macrocode}
local function deduplicate_hyphenation_list_none(hyphenation_list)
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{deduplicate_hyphenation_list_case}
%   Remove duplicates from \luavar{hyphenation_list} case-sensitively. The list is sorted first and then passed to
%   \luafunc{list_uniq} together with the matching equality check.
%    \begin{macrocode}
local function deduplicate_hyphenation_list_case(hyphenation_list)
    table.sort(hyphenation_list,lessthan_hyphenation_case_sensitive)
    list_uniq(hyphenation_list,equal_hyphenation_case_sensitive)
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{deduplicate_hyphenation_list_nocase}
%   Remove duplicates from \luavar{hyphenation_list} case-insensitively. The list is sorted first and then passed to
%   \luafunc{list_uniq} together with the matching equality check.
%    \begin{macrocode}
local function deduplicate_hyphenation_list_nocase(hyphenation_list)
    table.sort(hyphenation_list,lessthan_hyphenation_case_insensitive)
    list_uniq(hyphenation_list,equal_hyphenation_case_insensitive)
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{deduplicate_hyphenation_list}
%   Select the appropriate function for whether duplicates should be removed.
%    \begin{macrocode}
local deduplicate_hyphenation_list
if tex.count['l__lualisthyphen_unique_int'] == 1 then
    deduplicate_hyphenation_list = deduplicate_hyphenation_list_case
elseif tex.count['l__lualisthyphen_unique_int'] == 2 then
    deduplicate_hyphenation_list = deduplicate_hyphenation_list_nocase
else
    deduplicate_hyphenation_list = deduplicate_hyphenation_list_none
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \subsubsection{Combined processing}
%
% \begin{macro}[int]{process_lang_hyphenation_list}
%   Remove duplicates and sort \luavar{hyphenation_list}.
%    \begin{macrocode}
local function process_lang_hyphenation_list(hyphenation_list)
    deduplicate_hyphenation_list(hyphenation_list)
    sort_hyphenation_list(hyphenation_list)
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \subsection{Writing}
%
% \begin{macro}[int]{write_lang_hyphenation_list_standard}
%   Write out just the hyphenated words in \luavar{hyphenation_list} to file handle \luavar{f}, one division per
%   line. \luavar{widths} is accepted only so that the signature matches
%   \luafunc{write_lang_hyphenation_list_verbose}; it is not used.
%    \begin{macrocode}
local function write_lang_hyphenation_list_standard(f,hyphenation_list,widths)
    for i,v in ipairs(hyphenation_list) do
        if v then
            f:write(v[KEY_DIVISION] .. '\n')
        end
    end
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{write_lang_hyphenation_list_verbose}
%   Write out all hyphenation information in \luavar{hyphenation_list} to file handle \luavar{f}, in columns as
%   specified in \luavar{widths}.
%    \begin{macrocode}
local function write_lang_hyphenation_list_verbose(f,hyphenation_list,widths)
    local cols_word = widths[KEY_WORD]
    local cols_division = widths[KEY_DIVISION]
    local cols_page = widths[KEY_PAGE]
    for i,v in ipairs(hyphenation_list) do
        if v then
%    \end{macrocode}
%   It is possible for \luavar{KEY_PAGE} not to have been set, for instance if the hyphenation occurred in a box that
%   was never output.
%    \begin{macrocode}
            local page = v[KEY_PAGE]
            if page then
                page = STR_PAGE_PREFIX .. page
            else
                page = STR_PAGE_NONE
            end
            f:write(
                rpad(v[KEY_WORD],cols_word)
                .. STR_ARROW
                .. rpad(v[KEY_DIVISION],cols_division)
                .. STR_SPACE_TWO
                .. lpad(page,cols_page)
                .. STR_SPACE
                .. STR_QUOTE_OPEN
                .. v[KEY_CONTEXT]
                .. STR_QUOTE_CLOSE
                .. '\n'
            )
        end
    end
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{write_lang_hyphenation_list}
%   Set \luafunc{write_lang_hyphenation_list} to be either \luafunc{write_lang_hyphenation_list_standard} or
%   \luafunc{write_lang_hyphenation_list_verbose}, depending on the
%   package options.
%    \begin{macrocode}
local write_lang_hyphenation_list
if tex.count['l__lualisthyphen_verbose_int'] == 0 then
    write_lang_hyphenation_list = write_lang_hyphenation_list_standard
else
    write_lang_hyphenation_list = write_lang_hyphenation_list_verbose
end
%    \end{macrocode}
% \end{macro}
%
%
%
% Compute a settings description to insert into file headers. A sort or unique setting outside the known values is
% described as "none", which is how the selection of the sorting and deduplication functions treats such a value.
%    \begin{macrocode}
local settings_desc
if tex.count['l__lualisthyphen_verbose_int'] == 0 then
    settings_desc = 'verbose=false'
else
    settings_desc = 'verbose=true'
        .. ',context-before=' .. tex.count['l__lualisthyphen_context_before_int']
        .. ',context-after=' .. tex.count['l__lualisthyphen_context_after_int']
end
if tex.count['l__lualisthyphen_include_non_output_int'] == 0 then
    settings_desc = settings_desc .. ',include-non-output=false'
else
    settings_desc = settings_desc .. ',include-non-output=true'
end
local NONE_CASE_NOCASE = {
    [0] = 'none',
    [1] = 'case',
    [2] = 'nocase'
}
settings_desc = settings_desc
    .. ',sort='
    .. (NONE_CASE_NOCASE[tex.count['l__lualisthyphen_sort_int']] or NONE_CASE_NOCASE[0])
    .. ',unique='
    .. (NONE_CASE_NOCASE[tex.count['l__lualisthyphen_unique_int']] or NONE_CASE_NOCASE[0])
%    \end{macrocode}
%
%
%
% \begin{macro}[int]{get_hyphenation_file_path}
%   Get the path of the file to which the list of hyphenated words will be written, based on the given
%   \luavar{prefix}, \luavar{extension}, \luavar{lang_name}, and taking into account any specified output directory
%   for Lua\TeX.
%    \begin{macrocode}
local function get_hyphenation_file_path(prefix,extension,lang_name)
    local hyphenation_file_path = prefix .. tostring(lang_name) .. extension
    if not status.output_directory then
        return hyphenation_file_path
    end
    if string.sub(status.output_directory,-1,-1) == '/' then
        hyphenation_file_path = status.output_directory .. hyphenation_file_path
    else
        hyphenation_file_path = status.output_directory .. '/' .. hyphenation_file_path
    end
    return hyphenation_file_path
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{process_write_lang_hyphenation_list}
%   Process and write out the \luavar{hyphenation_list} (which will be for the language with the numerical
%   \luavar{lang_id}) to a file with the given \luavar{prefix} and \luavar{extension}, using \luavar{widths} for the
%   ‘columns’ in verbose mode. The file starts with a two-line header giving the language and the settings in force.
%    \begin{macrocode}
local function process_write_lang_hyphenation_list(
    prefix,extension,lang_id,hyphenation_list,widths
)
    process_lang_hyphenation_list(hyphenation_list)
%    \end{macrocode}
%   If no textual name is known for the language, use its numerical ID in the file name and in the header.
%    \begin{macrocode}
    local lang_name = language_table[lang_id]
    local lang_desc
    if not lang_name then
        lang_name = lang_id
        lang_desc = 'language with ID ' .. lang_id
    else
        lang_desc = 'language "' .. lang_name .. '" (ID ' .. lang_id .. ')'
    end
%    \end{macrocode}
%   Opening the file can fail (for instance, if the output directory is not writable), in which case
%   \luafunc{io.open} returns \luavar{nil} and a message. Raise an error that names the file and the reason.
%    \begin{macrocode}
    local path = get_hyphenation_file_path(prefix,extension,lang_name)
    local f,err = io.open(path,'w')
    if not f then
        error(
            'lua-list-hyphen: cannot open "' .. path .. '" for writing: ' .. tostring(err)
        )
    end
    f:write('% Chosen hyphenations for ' .. lang_desc .. '\n')
    f:write('% Generated by lua-list-hyphen (' .. settings_desc .. ')\n')
    write_lang_hyphenation_list(f,hyphenation_list,widths)
    f:close()
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{process_write_hyphenation_lists}
%   Sort \luavar{hyphenation_list} into per-language lists and write them out to separate files.
%    \begin{macrocode}
local function process_write_hyphenation_lists(prefix,extension)
    local lang_hyphenation_table = {}
    local lang_widths_table = {}
    local include_non_output = (
        tex.count['l__lualisthyphen_include_non_output_int'] == 1
    )
%    \end{macrocode}
%   Iterate through all the stored hyphenations. Sort them into per-language lists (creating the list the first time
%   each language is encountered) and also storing the maximum width of values, for output alignment. A hyphenation
%   whose page was never set is skipped unless non-output hyphenations are to be included; when it is included, it
%   does not contribute to the page width, since the ‘no page’ indicator is accounted for separately below.
%    \begin{macrocode}
    for _,h in ipairs(hyphenation_list) do
        local page = h[KEY_PAGE]
        if page or include_non_output then
            local lang = h[KEY_LANG]
            local t = lang_hyphenation_table[lang]
            if not t then
                lang_hyphenation_table[lang] = {}
                t = lang_hyphenation_table[lang]
            end
            local widths = lang_widths_table[lang]
            if not widths then
                lang_widths_table[lang] = {
                    [KEY_WORD] = 0,
                    [KEY_DIVISION] = 0,
                    [KEY_PAGE] = 0
                }
                widths = lang_widths_table[lang]
            end
            widths[KEY_WORD] = math.max(
                widths[KEY_WORD],
                unicode.utf8.len(h[KEY_WORD])
            )
            widths[KEY_DIVISION] = math.max(
                widths[KEY_DIVISION],
                unicode.utf8.len(h[KEY_DIVISION])
            )
            if page then
                widths[KEY_PAGE] = math.max(
                    widths[KEY_PAGE],
                    unicode.utf8.len(tostring(page))
                )
            end
            table.insert(t,h)
        end
    end
%    \end{macrocode}
%   Adjust the maximum width for the page output, since there is a prefix and a ‘no page’ indicator to consider.
%    \begin{macrocode}
    for _,widths in pairs(lang_widths_table) do
        widths[KEY_PAGE] = math.max(
            widths[KEY_PAGE] + unicode.utf8.len(STR_PAGE_PREFIX),
            unicode.utf8.len(STR_PAGE_NONE)
        )
    end
%    \end{macrocode}
%   If polyglossia is in use, populate \luavar{language_table}.
%    \begin{macrocode}
    polyglossia_get_language_names()
%    \end{macrocode}
%   For each language, process and write out its hyphenations to a file.
%    \begin{macrocode}
    for k,v in pairs(lang_hyphenation_table) do
        process_write_lang_hyphenation_list(prefix,extension,k,v,lang_widths_table[k])
    end
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \subsection{Export public functions}
%
% Finally, make available the functions that will be called from the \LaTeX\ frontend using \cs{lua_now:n}.
%    \begin{macrocode}
return {
    process_write_hyphenation_lists = process_write_hyphenation_lists,
    set_hyphenation_page = set_hyphenation_page,
    babel_save_language_name = babel_save_language_name,
}
%    \end{macrocode}
%
%
%
%    \begin{macrocode}
%
%    \end{macrocode}
%
%
%
% \clearpage
% \end{implementation}
%
%
%
% \iffalse
%<*metadriver>
\input{lua-list-hyphen.dtx}
%
% \fi