#!/usr/bin/env texlua
-- Version string reported by the --version option.
local show_pdf_tags_version = "1.1"
-- Run kpathsea lookups under the 'lualatex' program name.
kpse.set_program_name'lualatex'
-- Directory part of this script's own path (nil if it cannot be extracted).
-- It is prepended to package.path so the helper modules required below are
-- also searched for next to the script.
local mypath = string.match(debug.getinfo(1, 'S').source, '@(.*)[/\\][^/\\]+')
if mypath then
package.path = mypath .. '/?.lua;' .. package.path
end
-- Command line settings. These are the defaults; the option parser at the
-- end of the file overrides them.
local out_format = "tree"
local follow_rolemap = false
local hide_w3c = false
-- Use the pdfe library if it is already available as a global, otherwise load it.
local pdfe = pdfe or require'pdfe'
-- Helper modules: content stream processing and PDF text string decoding.
local process_stream = require'show-pdf-tags_process_stream'
local text_string_to_utf8 = require'show-pdf-tags_decode'.text_string_to_utf8
--- Iterate over a table in sorted key order.
-- The keys are collected and sorted once, up front, using the default
-- `table.sort` ordering (so all keys must be mutually comparable).
-- @param t table to traverse
-- @return an iterator function yielding `key, value` pairs
local function ordered_pairs(t)
  local sorted = {}
  for key in pairs(t) do
    sorted[#sorted + 1] = key
  end
  table.sort(sorted)

  local pos = 0
  return function()
    pos = pos + 1
    local key = sorted[pos]
    if key ~= nil then
      return key, t[key]
    end
  end
end
--- Follow a chain of references until a non-reference value is reached.
-- Takes the `type, value, detail` triple as returned by the pdfe getters.
-- While the type code is 10 the value is dereferenced with
-- `pdfe.getfromreference`; the detail of the last reference that was
-- followed is remembered and returned as `id`.
-- @return id (nil if the input was not a reference), then the resolved
--   `type, value, detail` triple
local function almost_resolve(t, v, i)
  local id
  if t == 10 then
    repeat
      id = i
      t, v, i = pdfe.getfromreference(v)
    until t ~= 10
  end
  return id, t, v, i
end
--- Return the reference id of the page named by an element's `Pg` entry.
-- Returns nothing when the entry is absent (no type, or a type code below 2).
-- Raises an assertion error when the entry is present but is not a
-- reference that resolves to a dictionary (type code 8).
local function get_page(elem)
  local ref_id, kind = almost_resolve(pdfe.getfromdictionary(elem, 'Pg'))
  local missing = not kind or kind < 2
  if missing then
    return
  end
  assert(ref_id and kind == 8, 'page should be a dictionary')
  return ref_id
end
-- Forward declaration: convert and convert_kids call each other, so the
-- local name has to exist before convert is defined.
local convert_kids
--- Attach conditionally generated warnings to a result object.
-- Returns a function taking `(ctx, warning_generator)`. The generator is
-- called with a `warn(condition, message)` callback and the result object;
-- every message whose condition is truthy is collected. The collected list
-- is stored in `ctx.warnings[result]` (the entry is cleared when nothing was
-- collected) and `result` is returned.
local function with_warnings(result)
  return function(ctx, warning_generator)
    local collected = {}
    local function warn(condition, warning)
      if not condition then
        return
      end
      collected[#collected + 1] = warning
    end
    warning_generator(warn, result)
    ctx.warnings[result] = collected[1] and collected
    return result
  end
end
-- Unique marker value: used as the key under which the warnings returned by
-- process_stream are stored inside the per-stream data table. The marker
-- table itself always stays empty.
local warnings_key = {}
--- Convert a marked content reference into an 'MCR' node.
-- The referenced content stream (either an explicit stream or the contents
-- of the referenced page) is processed once with `process_stream` and cached
-- in `ctx.streams`, keyed by the stream id or page id.
--
-- Broken references no longer abort the conversion: when the page or stream
-- cannot be determined the node is still returned (without content) and the
-- problem is recorded as a warning on the node.
-- @param ctx conversion context (uses pagenos, streams, document, warnings)
-- @param mcid marked content identifier to look up in the stream data
-- @param page reference id of the page, may be nil
-- @param stream_id reference id of an explicit content stream, may be nil
-- @param stream the explicit content stream object, may be nil
-- @param owner value stored in the node's `owner` field
-- @return the MCR node table
local function convert_mc(ctx, mcid, page, stream_id, stream, owner)
  local stream_or_page_id = stream_id or page
  local pageno = ctx.pagenos[page]
  local stream_data = ctx.streams[stream_or_page_id]
  if stream_or_page_id and not stream_data then
    -- The page dictionary is only needed when there is no explicit stream
    -- or when the stream does not carry its own resources.
    local need_page = not (stream and stream.Resources)
    local page_dict = need_page and pageno and ctx.document.Pages[pageno] or nil
    if stream or page_dict then
      local warnings
      stream_data, warnings = process_stream(
        stream or page_dict.Contents,
        stream and stream.Resources or (page_dict and page_dict.Resources)
      )
      if stream_data then
        stream_data[warnings_key] = warnings
        ctx.streams[stream_or_page_id] = stream_data
      end
    end
  end
  return with_warnings {
    type = 'MCR',
    page = pageno,
    stream = stream,
    owner = owner,
    -- stream_data is nil when no stream could be located; see the warnings below.
    content = stream_data and stream_data[mcid],
  } (ctx, function(warn)
    warn(not page, 'Missing page reference in marked content reference')
    warn(page and not pageno, 'Page referenced in marked content reference does not exist')
    warn(not stream_data, 'No stream referenced in marked content reference')
    warn(stream_data and not stream_data[mcid], 'Referenced marked content sequence not found')
    -- Forward the warnings recorded for this marked content sequence while
    -- the stream was processed.
    local stream_warnings = stream_data and stream_data[warnings_key]
    local warnings = stream_warnings and stream_warnings[mcid]
    if warnings then
      for _, warning in ipairs(warnings) do
        warn(true, warning)
      end
    end
  end)
end
--- Convert an object reference into an 'OBJR' node.
-- The `Obj` entry must be a reference: the assertion fails when resolving
-- it yields no reference id.
-- @param ctx conversion context (uses pagenos)
-- @param obj the object reference dictionary
-- @param page reference id of the page, may be nil
-- @return node table with `type`, `page`, `ObjId` and `Obj` fields
local function convert_objr(ctx, obj, page)
  local ref_id, _, target = assert(almost_resolve(pdfe.getfromdictionary(obj, 'Obj')))
  local node = {type = 'OBJR', ObjId = ref_id, Obj = target}
  -- TODO: assert(ctx.pagenos[page]) once tagpdf is adapted
  node.page = ctx.pagenos[page]
  return node
end
-- Namespace used for structure elements without an explicit NS entry.
local default_namespace = 'http://iso.org/pdf/ssn'
-- Prefix prepended to an attribute owner name to form its key in the
-- converted attribute table.
-- This prefix is still to be confirmed; another possibility would be `data:`.
local owner_prefix = 'http://iso.org/pdf/ssn/'
--- Read a text string entry and decode it to UTF-8.
-- @param container pdfe dictionary or array holding the string
-- @param index key or position of the string inside `container`
-- @param warnings list that receives a message when decoding fails
-- @return the decoded string, "??" when the string cannot be decoded, or
--   nothing when the entry does not exist
local function get_string(container, index, warnings)
  local text = pdfe.getstring(container, index, true)
  if not text then
    return
  end
  local u = text_string_to_utf8:match(text)
  if u == nil then
    -- Record an actual message. Previously the undefined name `warning`
    -- was stored here, so the failure was silently dropped.
    warnings[#warnings + 1] = string.format(
      'Text string in entry %s could not be decoded', tostring(index))
    return "??"
  end
  return u
end
--- Convert a pdfe value into plain Lua data.
-- Dispatches on the pdfe type code `t`:
--   * 10 (reference): resolved first; the converted value is cached per
--     reference id so a shared object is converted only once
--   * below 2 or missing: no value
--   * 2 to 5: the raw value `v` is returned unchanged
--   * 6: decoded with get_string
--   * 7: array, converted element by element
--   * 8: dictionary, converted entry by entry
--   * anything else: assertion failure (streams are not handled)
-- NOTE(review): the cache entry is written only after a reference has been
-- converted completely, so self-referential structures would recurse
-- without end. Confirm that such input cannot occur.
-- @param container pdfe object the value was read from
-- @param index key or position of the value inside `container`
-- @param t,v,x type code, value and detail as returned by the pdfe getters
-- @param warnings list collecting string decoding warnings
-- @return the converted value, and `warnings` if it is non-empty
local function pdf2lua(container, index, t, v, x, warnings)
  local saved = {}
  local function recurse(container, index, t, v, x)
    if t == 10 then
      local id
      id, t, v, x = almost_resolve(t, v, x)
      local result = saved[id]
      if result == nil then
        result = recurse(container, index, t, v, x)
        saved[id] = result
      end
      return result
    end
    if not t or t < 2 then
      return
    elseif t < 6 then
      return v
    elseif t == 6 then
      return get_string(container, index, warnings)
    elseif t == 7 then
      local arr = {}
      for i = 1, #v do
        arr[i] = recurse(v, i, pdfe.getfromarray(v, i))
      end
      return arr
    elseif t == 8 then
      local dict = {}
      for i = 1, #v do
        local k, inner_t, inner_v, detail = pdfe.getfromdictionary(v, i)
        dict[k] = recurse(v, k, inner_t, inner_v, detail)
      end
      return dict
    else
      assert(false, 'Streams are not handled at the moment')
    end
  end
  -- Lists are 1-based: the first warning lives at index 1, not 0.
  return recurse(container, index, t, v, x), warnings[1] and warnings
end
--- Collect the attributes of a structure element into Lua tables.
-- Attributes from the referenced classes are applied first, then the
-- element's own attributes, so the latter overwrite class values with the
-- same owner and key.
-- The result maps an owner key to a table of `key -> converted value`. The
-- owner key is `owner_prefix .. O`, except for owner 'NSO', where the
-- namespace name (or "" when there is no NS entry) is used.
-- @param ctx conversion context (uses ClassMap and warnings)
-- @param attrs attribute dictionary, or array of them, may be nil
-- @param classes class name, or array of class names, may be nil
-- @return the attribute table, or nothing when neither argument is given
local function convert_attributes(ctx, attrs, classes)
  if not classes and not attrs then return end
  local attributes = {}

  -- Merge one attribute dictionary into `attributes`.
  local function apply_attr(attr)
    local owner = assert(attr.O)
    if owner == 'NSO' then
      -- avoid error if no-namespace attributes to be modelled by missing NS field
      owner = (attr.NS and attr.NS.NS) or ""
    else
      owner = owner_prefix .. owner
    end
    local owner_dict = attributes[owner]
    if not owner_dict then
      owner_dict = {}
      attributes[owner] = owner_dict
    end
    for i = 1, #attr do
      local key, t, v, extra = pdfe.getfromdictionary(attr, i)
      if key ~= 'O' and key ~= 'NS' then
        -- Decoding warnings for attribute values are accumulated under the
        -- shared key `false` of ctx.warnings.
        local warnings = ctx.warnings[false] or {}
        owner_dict[key] = pdf2lua(attr, key, t, v, extra, warnings)
        if warnings[1] then
          ctx.warnings[false] = warnings
        end
      end
    end
  end

  -- Apply a single attribute dictionary or every dictionary in an array.
  -- Numeric array entries are skipped.
  local function apply_attrs(attrs)
    if attrs == nil then return end
    local t = pdfe.type(attrs)
    if t == 'pdfe.dictionary' then
      apply_attr(attrs)
    else
      assert(t == 'pdfe.array')
      for i = 1, #attrs do
        local attr = attrs[i]
        if type(attr) ~= 'number' then
          apply_attr(attr)
        end
      end
    end
  end

  if classes then
    -- The class map is optional: without it (or for an unknown class name)
    -- there is simply nothing to apply, instead of an index-nil error.
    local class_map = ctx.ClassMap
    local function apply_class(class)
      apply_attrs(class_map and class_map[class])
    end
    if type(classes) == 'string' then
      apply_class(classes)
    else
      for i = 1, #classes do
        local class = classes[i]
        if type(class) ~= 'number' then
          apply_class(class)
        end
      end
    end
  end
  if attrs then
    apply_attrs(attrs)
  end
  return attributes
end
--- Convert one kid of a structure element into a Lua node.
-- Dispatches on the kind of kid:
--   * a number is a marked content id on the inherited page
--   * a dictionary of Type 'MCR' is a marked content reference
--   * a dictionary of Type 'OBJR' is an object reference
--   * anything else is treated as a structure element and converted
--     together with its kids
-- Structure elements are registered in `ctx.id_map` under their reference
-- id so `Ref` entries can be resolved later. Elements with a non-empty
-- `Ref` array are queued in `ctx.ref_entries`.
-- @param ctx conversion context
-- @param elem the kid: a number or a pdfe dictionary
-- @param id reference id through which `elem` was reached, nil if direct
-- @param page reference id of the page inherited from the parent, may be nil
-- @return the node table
local function convert(ctx, elem, id, page)
  if type(elem) == 'number' then
    return convert_mc(ctx, elem, page)
  elseif elem.Type == 'MCR' then
    local stm_id, _, stm = almost_resolve(pdfe.getfromdictionary(elem, 'Stm'))
    return convert_mc(ctx, elem.MCID, get_page(elem) or page, stm_id, stm, elem.StmOwn)
  elseif elem.Type == 'OBJR' then
    return convert_objr(ctx, elem, get_page(elem) or page)
  end
  local warnings = {}
  local obj = {
    -- type_maps is keyed by tostring() of the namespace dictionary, or by
    -- false for elements without a namespace.
    subtype = ctx.type_maps[elem.NS and tostring(elem.NS) or false][elem.S],
    attributes = convert_attributes(ctx, elem.A, elem.C),
    title = get_string(elem, 'T', warnings),
    lang = get_string(elem, 'Lang', warnings),
    alt = get_string(elem, 'Alt', warnings),
    expanded = get_string(elem, 'E', warnings),
    actual_text = get_string(elem, 'ActualText', warnings),
    associated_files = elem.AF,
    id = get_string(elem, 'ID', warnings),
    phoneme = get_string(elem, 'Phoneme', warnings),
    phonetic_alphabet = get_string(elem, 'PhoneticAlphabet', warnings),
    kids = convert_kids(ctx, elem),
  }
  if warnings[1] then
    ctx.warnings[obj] = warnings
  end
  -- A directly embedded element has no reference id. It cannot be the
  -- target of a Ref entry, and nil is not a valid table key.
  if id ~= nil then
    ctx.id_map[id] = obj
  end
  local elem_ref = elem.Ref
  if elem_ref and #elem_ref > 0 then
    -- Store the reference ids for now; they are replaced by the converted
    -- nodes once the whole tree has been built.
    local ref = {}
    for i = 1, #elem_ref do
      ref[i] = assert(almost_resolve(pdfe.getfromarray(elem_ref, i)))
    end
    obj.ref = ref
    ctx.ref_entries[#ctx.ref_entries + 1] = obj
  end
  return obj
end
--- Convert the `K` entry of a structure element (or the structure root).
-- @param ctx conversion context
-- @param elem dictionary whose kids are converted
-- @return a list of converted nodes, or nil when there is no `K` entry.
--   A single (non-array) kid yields a one-element list.
function convert_kids(ctx, elem)
  local id, kind, kids_entry = almost_resolve(pdfe.getfromdictionary(elem, 'K'))
  if not kids_entry then
    return nil
  end
  local page = get_page(elem)
  if kind ~= 7 then
    return {convert(ctx, kids_entry, id, page)}
  end
  local nodes = {}
  for idx = 1, #kids_entry do
    -- Only the reference id is taken from the resolution; the kid itself is
    -- obtained by indexing the array.
    local kid_id = almost_resolve(pdfe.getfromarray(kids_entry, idx))
    nodes[idx] = convert(ctx, kids_entry[idx], kid_id, page)
  end
  return nodes
end
--- Convert a role map dictionary into a Lua table of mappings.
-- Each valid entry becomes `{target_name, target_namespace}`:
--   * a name entry (type code 5) maps to `{name, false}`
--   * a two-element array entry (type code 7) maps to its two elements
-- Invalid entries are reported on stderr and removed from the result, so
-- they can never be picked up as a mapping later.
-- @param role_map pdfe dictionary, may be nil
-- @return table mapping element names to mappings (empty if no role map)
local function role_map_to_lua(role_map)
  if not role_map then return {} end
  local lua_role_map = pdfe.dictionarytotable(role_map)
  for k, mapping in pairs(lua_role_map) do
    local valid = false
    if mapping[1] == 5 then -- name
      mapping[1], mapping[2] = mapping[2], false
      valid = true
    elseif mapping[1] == 7 then -- array
      if mapping[3] == 2 then
        -- The right-hand side is evaluated first; mapping[3] becomes nil.
        mapping[1], mapping[2], mapping[3] = mapping[2][1], mapping[2][2]
        valid = true
      else
        io.stderr:write"Ignoring entry with invalid length in rolemap\n"
      end
    else
      io.stderr:write"Ignoring invalid rolemap entry\n"
    end
    if not valid then
      -- Clearing an existing field during a pairs traversal is permitted.
      lua_role_map[k] = nil
    end
  end
  return lua_role_map
end
--- Open a PDF file and convert its structure tree.
-- @param filename path of the PDF file
-- @return the list of converted top-level nodes and the conversion context,
--   or `nil, message` when the document cannot be opened. A document
--   without a catalog or structure tree root yields an empty list.
local function open(filename)
  local document = pdfe.open(filename)
  -- A missing status is treated like a positive (failing) one.
  if 0 < (pdfe.getstatus(document) or 2) then
    return nil, 'Failed to open document'
  end
  local ctx = {
    document = document,
    streams = {},
  }
  local catalog = pdfe.getcatalog(document)
  local markinfo = catalog and catalog.MarkInfo
  local tagged = markinfo and markinfo.Marked
  if not tagged then
    io.stderr:write("Document catalog has no markinfo.Marked entry. It might not be tagged.\n")
  end
  -- Map the third field of every page entry to the page number. The lookup
  -- key used elsewhere is the reference id returned by get_page.
  local pagenos = {}
  for i, page in ipairs(pdfe.pagestotable(document)) do
    pagenos[page[3]] = i
  end
  ctx.pagenos = pagenos
  local id_map = {}
  ctx.id_map = id_map
  ctx.ref_entries = {}
  -- Guard the catalog here as well, consistent with the MarkInfo lookup above.
  local structroot = catalog and catalog.StructTreeRoot
  if not structroot then
    return {}, ctx
  end
  -- type_maps[ns_key][name] lazily creates the description of a structure
  -- type and follows the role map of its namespace. Index 0 stands for
  -- "no namespace" (key false) and uses the RoleMap of the structure root.
  local type_maps = {}
  do
    local namespaces = structroot.Namespaces
    for i = 0, namespaces and #namespaces or 0 do
      local ns, ns_key, role_map
      if i == 0 then
        ns, ns_key = false, false
        role_map = structroot.RoleMap
      else
        local namespace = namespaces[i]
        ns = namespace.NS
        ns_key = tostring(namespace)
        role_map = namespace.RoleMapNS
      end
      role_map = role_map_to_lua(role_map)
      type_maps[ns_key] = setmetatable({}, {__index = function(t, elem)
        local element = {subtype = elem, namespace = ns}
        -- Store the entry before following the mapping, so a role map that
        -- loops back to this type finds the existing entry.
        t[elem] = element
        local mapped = role_map[elem]
        if mapped then
          element.mapped = type_maps[mapped[2] and tostring(mapped[2])][mapped[1]]
        end
        return element
      end})
    end
  end
  ctx.type_maps = type_maps
  ctx.ClassMap = structroot.ClassMap
  -- Warnings are keyed by node with weak keys.
  ctx.warnings = setmetatable({}, {__mode = 'k'})
  local elements = convert_kids(ctx, structroot)
  ctx.ClassMap = nil
  -- Replace the reference ids collected in `ref` lists by the converted nodes.
  for _, obj in ipairs(ctx.ref_entries) do
    local refs = obj.ref
    for i, ref in ipairs(refs) do
      refs[i] = assert(id_map[ref])
    end
  end
  ctx.ref_entries = nil
  return elements, ctx
end
--- Number every node that is the target of a `ref` entry.
-- The tree is walked depth first; each target gets the next number the
-- first time it is encountered.
-- @param tree list of nodes (with optional `ref` and `kids` lists)
-- @return table mapping referenced nodes to their number, and the count
local function mark_references(tree)
  local numbering = {}
  local total = 0

  local function visit(nodes)
    for _, node in ipairs(nodes) do
      local targets = node.ref
      if targets then
        for _, target in ipairs(targets) do
          if numbering[target] == nil or numbering[target] == false then
            total = total + 1
            numbering[target] = total
          end
        end
      end
      local children = node.kids
      if children then
        visit(children)
      end
    end
  end

  visit(tree)
  return numbering, total
end
--- Format a structure type for the tree output.
-- @param subtype table with `subtype` and optional `namespace` fields
-- @return "name (namespace)" when a namespace is set, else the bare name
local function format_subtype(subtype)
  local ns = subtype.namespace
  if not ns then
    return subtype.subtype
  end
  return string.format('%s (%s)', subtype.subtype, ns)
end
--- Format the beginning of an XML start tag for a structure type.
-- The tag is left open (no closing ">") so the caller can append
-- attributes. When a namespace is set it is emitted as `xmlns`; with the
-- `hide_w3c` setting enabled, 'http://www.w3.org' inside the namespace is
-- rewritten to 'http://-www.w3.org'.
-- @param subtype table with `subtype` and optional `namespace` fields
-- @return the start of the tag as a string
local function format_subtype_xml(subtype)
  local ns = subtype.namespace
  if not ns then
    return "<" .. subtype.subtype
  end
  if hide_w3c then
    -- Plain assignment keeps only the first gsub result (the string).
    ns = ns:gsub('http://www.w3.org', 'http://-www.w3.org')
  end
  return string.format('<%s xmlns="%s"', subtype.subtype, ns)
end
--- Print the converted structure tree as an indented text tree.
-- Every node is printed on its own line. Structure elements are followed
-- by their properties (title, language, attributes, ...) and then by their
-- kids. Warnings recorded for a node are printed right before it.
-- @param tree list of top-level nodes as returned by open
-- @param ctx conversion context (uses ctx.warnings)
local function print_tree(tree, ctx)
  local referenced = mark_references(tree)
  -- first_prefix/prefix are used for all but the last node of a list;
  -- last_first_prefix/last_prefix replace them for the last node.
  local function recurse(objs, first_prefix, last_first_prefix, prefix, last_prefix)
    for i, obj in ipairs(objs) do
      local warnings = ctx.warnings[obj]
      if warnings then
        print'### Warnings encountered:'
        for _, warning in ipairs(warnings) do
          print('# ' .. warning)
        end
      end
      -- Exactly one header line is printed per node, by the branch matching
      -- its type below. (An extra "Marked content" line used to be printed
      -- here for every node.)
      if i == #objs then first_prefix, prefix = last_first_prefix, last_prefix end
      if obj.type == 'MCR' then
        print(string.format('%sMarked content on page %i: %s', first_prefix, obj.page or -1, obj.content or ''))
      elseif obj.type == 'OBJR' then
        local t = obj.Obj.Type
        t = t and string.format(' of type %s', t) or ''
        local page = obj.page
        page = page and string.format(' on page %i', page) or '' -- TODO: Should eventually become always true
        print(string.format('%sReferenced object%s%s', first_prefix, t, page))
      else
        local mark = obj.kids and ':' or ''
        local subtype = obj.subtype
        local mapped = subtype.mapped
        mapped = mapped and ' / ' .. format_subtype(mapped) or ''
        print(string.format('%s%s%s%s', first_prefix, format_subtype(subtype), mapped, mark))
        local lines = {}
        if referenced[obj] then
          lines[#lines + 1] = 'Referenced as object ' .. referenced[obj]
        end
        if obj.title then
          lines[#lines + 1] = 'Title: ' .. obj.title
        end
        if obj.lang then
          lines[#lines + 1] = 'Language: ' .. obj.lang
        end
        if obj.expanded then
          lines[#lines + 1] = 'Expansion: ' .. obj.expanded
        end
        if obj.alt then
          lines[#lines + 1] = 'Alternate text: ' .. obj.alt
        end
        if obj.actual_text then
          lines[#lines + 1] = 'Actual text: ' .. obj.actual_text
        end
        if obj.phoneme then
          lines[#lines + 1] = 'Phoneme: ' .. obj.phoneme
        end
        if obj.phonetic_alphabet then
          lines[#lines + 1] = 'PhoneticAlphabet: ' .. obj.phonetic_alphabet
        end
        if obj.associated_files then
          local af_output = ''
          for _, file in ipairs(obj.associated_files) do
            -- A file specification without an EF entry has no embedded content.
            local embedded = file.EF and file.EF.F
            if embedded then
              -- NOTE(review): the text is overwritten on every iteration, so
              -- only the last embedded file is shown. Confirm this is intended.
              af_output = '\n└─Content: ' .. pdfe.readwholestream(embedded, true):gsub('\n', '\n ')
            end
          end
          lines[#lines + 1] = 'Associated files are present:' .. af_output
        end
        if obj.attributes then
          -- Build one text block per owner, with owners and keys in sorted
          -- order, and draw them as a nested tree.
          local owners = {}
          for k in next, obj.attributes do
            owners[#owners + 1] = k
          end
          table.sort(owners)
          for i = 1, #owners do
            local attrs = {}
            for k in next, obj.attributes[owners[i]] do
              attrs[#attrs + 1] = k
            end
            table.sort(attrs)
            for j = 1, #attrs do
              attrs[j] = attrs[j] .. ': ' .. require'inspect'(obj.attributes[owners[i]][attrs[j]])
            end
            -- Owners starting with owner_prefix are displayed as "/Name".
            table.insert(attrs, 1, (owners[i]:sub(1, #owner_prefix) == owner_prefix and '/' .. owners[i]:sub(#owner_prefix+1) or owners[i]) .. ':')
            for j = 1, #attrs-1 do
              attrs[j] = attrs[j]:gsub('\n', '\n│')
            end
            owners[i] = table.concat(attrs, '\n├', 1, #attrs-1) .. '\n└' .. attrs[#attrs]:gsub('\n', '\n ')
          end
          table.insert(owners, 1, 'Attributes: ')
          for j = 1, #owners-1 do
            owners[j] = owners[j]:gsub('\n', '\n│')
          end
          lines[#lines + 1] = table.concat(owners, '\n├', 1, #owners-1) .. '\n└' .. owners[#owners]:gsub('\n', '\n ')
        end
        if obj.ref then
          local refs = {}
          for i, r in ipairs(obj.ref) do
            refs[i] = referenced[r]
          end
          lines[#lines + 1] = 'References object' .. (refs[2] and 's' or '') .. ' ' .. table.concat(refs, ', ')
        end
        if obj.kids then
          for _, l in ipairs(lines) do
            print(prefix .. '┝━━' .. l:gsub('\n', '\n' .. prefix .. '│ '))
          end
          recurse(obj.kids, prefix .. '├─', prefix .. '└─', prefix .. '│ ', prefix .. ' ')
        elseif #lines > 0 then
          -- Without kids the last property line closes the branch.
          for i = 1, #lines-1 do
            print(prefix .. '┝━━' .. lines[i]:gsub('\n', '\n' .. prefix .. '│ '))
          end
          print(prefix .. '┕━━' .. lines[#lines]:gsub('\n', '\n' .. prefix .. ' '))
        end
      end
    end
  end
  return recurse(tree, '', '', '', '')
end
--- Print the converted structure tree as XML.
-- Structure elements become XML elements whose attributes are printed one
-- per line. Marked content and object references are emitted as processing
-- instructions (marked content is followed by its text), and warnings as
-- comments. When the tree does not have exactly one top-level node, the
-- output is wrapped in a synthetic <StructTreeRoot> element.
--
-- Characters with a special meaning in XML (&, < and, inside attribute
-- values, ") are replaced by the predefined entities. NUL and, where the
-- original code did so, other control characters are replaced by
-- placeholders.
-- @param tree list of top-level nodes as returned by open
-- @param ctx conversion context (uses ctx.warnings)
local function print_tree_xml(tree, ctx)
  local referenced = mark_references(tree)

  -- Escape text for use inside a double-quoted attribute value.
  local function escape_attr(text, replace_controls)
    local escaped = text:gsub('&', '&amp;'):gsub('<', '&lt;'):gsub('"', '&quot;'):gsub('\0', '[NULL]')
    if replace_controls then
      escaped = escaped:gsub('[\1-\8\11\12\14-\31]', '[CTRL]')
    end
    return escaped
  end

  -- Escape marked content text for use as character data.
  local function escape_content(text)
    local escaped = text:gsub('&', '&amp;'):gsub('<', '&lt;'):gsub('\0', '[NULL]')
      :gsub('[\1-\8\11\12\14-\31]', '[CTRL]'):gsub('�.*', '[TEXT]')
    return escaped
  end

  local function recurse(objs, indent)
    for _, obj in ipairs(objs) do
      local warnings = ctx.warnings[obj]
      if warnings then
        for _, warning in ipairs(warnings) do
          -- "--" must not occur inside an XML comment.
          local text = tostring(warning):gsub('%-%-', '- -')
          print(string.format('%s<!-- Warning: %s -->', indent, text))
        end
      end
      if obj.type == 'MCR' then
        local content = obj.content and escape_content(obj.content) or "[missing]"
        print(string.format('%s<?MarkedContent page="%i" ?>%s', indent, obj.page or -1, content))
      elseif obj.type == 'OBJR' then
        local t = obj.Obj.Type
        t = t and string.format(' type="%s"', t) or ''
        local page = obj.page
        page = page and string.format(' page="%i"', page) or '' -- TODO: Should eventually become always true
        print(string.format('%s<?ReferencedObject%s%s ?>', indent, t, page))
      else
        local subtype = obj.subtype
        local mapped = subtype.mapped
        -- With --map the element is printed under the type it is role
        -- mapped to; the closing tag has to use the same name.
        local shown = (follow_rolemap and mapped) or subtype
        print(string.format('%s%s', indent, format_subtype_xml(shown)))
        local lines = {}
        if obj.id then
          lines[#lines + 1] = ' id="' .. escape_attr(obj.id, false) .. '"'
        end
        if obj.title then
          lines[#lines + 1] = ' title="' .. escape_attr(obj.title, true) .. '"'
        end
        if obj.lang then
          lines[#lines + 1] = ' lang="' .. escape_attr(obj.lang, false) .. '"'
        end
        if obj.expanded then
          lines[#lines + 1] = ' expansion="' .. escape_attr(obj.expanded, true) .. '"'
        end
        if obj.alt then
          lines[#lines + 1] = ' alt="' .. escape_attr(obj.alt, true) .. '"'
        end
        if obj.actual_text then
          lines[#lines + 1] = ' actualtext="' .. escape_attr(obj.actual_text, true) .. '"'
        end
        if obj.phoneme then
          lines[#lines + 1] = ' phoneme="' .. escape_attr(obj.phoneme, true) .. '"'
        end
        if obj.phonetic_alphabet then
          lines[#lines + 1] = ' phonetic-alphabet="' .. escape_attr(obj.phonetic_alphabet, true) .. '"'
        end
        if obj.associated_files then
          local f = {}
          local af_warnings = {}
          for _, file in ipairs(obj.associated_files) do
            -- Only file specifications with embedded content are listed.
            if file.EF and file.EF.F then
              f[#f+1] = get_string(file, "UF", af_warnings)
            end
          end
          for _, warning in ipairs(af_warnings) do
            io.stderr:write('Warning while processing associated files: ' .. warning .. '\n')
          end
          lines[#lines + 1] = ' af="' .. escape_attr(table.concat(f, ' '), false) .. '"'
        end
        if obj.attributes then
          for k, v in ordered_pairs(obj.attributes) do
            -- Attributes whose owner differs from the element's namespace
            -- get a prefix derived from the last path segment of the owner.
            local attrns = ""
            if k ~= subtype.namespace then
              attrns = k:gsub('.*/', '')
            end
            if type(v) == "table" then
              if attrns ~= "" then
                lines[#lines + 1] = ' xmlns:' .. attrns .. '="' .. k .. '"'
                attrns = attrns .. ':'
              end
              for kk, vv in ordered_pairs(v) do
                if type(vv) == "table" then
                  vv = require'inspect'(vv):gsub('\n[ ]*', ' ')
                end
                lines[#lines + 1] = ' ' .. attrns .. kk .. '="' .. escape_attr(tostring(vv), false) .. '"'
              end
            else
              io.stderr:write("Unexpected attributes object\n")
            end
          end
        end
        if mapped and mapped.subtype then
          if follow_rolemap then
            if subtype.namespace then
              lines[#lines+1] = ' xmlns:orig-ns="' .. subtype.namespace .. '"'
              lines[#lines+1] = ' rolemapped-from="orig-ns:' .. subtype.subtype .. '"'
            else
              lines[#lines+1] = ' rolemapped-from="' .. subtype.subtype .. '"'
            end
          else
            lines[#lines+1] = ' rolemaps-to="' .. mapped.subtype .. '"'
          end
        end
        if referenced[obj] then
          lines[#lines + 1] = ' referenced-as="' .. referenced[obj] .. '"'
        end
        -- End of the start tag; everything after this line is element content.
        lines[#lines+1] = ">"
        if obj.ref then
          local refs = {}
          for i, r in ipairs(obj.ref) do
            refs[i] = referenced[r]
          end
          lines[#lines + 1] = '<?References ' .. table.concat(refs, ', ') .. ' ?>'
        end
        for _, l in ipairs(lines) do
          print(indent .. ' ' .. l:gsub('\n', '\n' .. indent .. ' '))
        end
        if obj.kids then
          recurse(obj.kids, indent .. ' ')
        end
        print(indent .. "</" .. shown.subtype .. ">")
      end
    end
  end

  if #tree == 1 then
    return recurse(tree, '')
  else
    -- XML requires a single root element.
    print("<!-- The structure tree does not have exactly one top-level element -->")
    print("<StructTreeRoot>")
    recurse(tree, ' ')
    print("</StructTreeRoot>")
    return
  end
end
-- Help text; %s is replaced by the script name (arg[0]).
local helpstr =[[
Usage: %s options .pdf
Options
--help|-h show this help
--version|-v show the current version
--tree (default) show as tree
--xml show as XML
--table show Lua table structure
--map Follow role mapping (xml printer)
--w3c- Add - to w3c namespaces to force browser tree display
]]

-- Options that select the output format.
local output_formats = {
  ["--tree"] = "tree",
  ["--xml"] = "xml",
  ["--table"] = "table",
}

-- Consume leading arguments that start with "-". Help, version and unknown
-- options end the script immediately.
local argi = 1
while argi <= #arg and arg[argi]:match("^%-") do
  local option = arg[argi]
  local selected_format = output_formats[option]
  if selected_format then
    out_format = selected_format
  elseif option == "--map" then
    follow_rolemap = true
  elseif option == "--w3c-" then
    hide_w3c = true
  elseif option == "--help" or option == "-h" then
    io.stderr:write(helpstr:format(arg[0]))
    return
  elseif option == "--version" or option == "-v" then
    io.stderr:write(("show-pdf-tags version: %s\n"):format(show_pdf_tags_version))
    return
  else
    io.stderr:write(('Unknown option: %s\n'):format(option))
    return
  end
  argi = argi + 1
end

-- Exactly one argument (the PDF file) must remain after the options.
local remaining = #arg - argi + 1
if remaining > 1 then
  io.stderr:write(('Extra argument. Usage: %s options .pdf\n'):format(arg[0]))
  return
elseif remaining < 1 then
  io.stderr:write(('Missing argument. Usage: %s options .pdf\n'):format(arg[0]))
  return
end

-- Convert the document and print it in the selected format.
local struct, ctx = assert(open(arg[argi]))
if out_format == "tree" then
  print_tree(struct, ctx)
elseif out_format == "xml" then
  print_tree_xml(struct, ctx)
else
  print(require'inspect'(struct))
end