Created
May 11, 2016 15:57
-
-
Save DoggettCK/787a067d3ac6ad6222e332010ed53f3f to your computer and use it in GitHub Desktop.
Preliminary Neotomex PDF object grammar. Still has some bugs around object name edge cases.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
defmodule PdfObjectGrammar do
  @moduledoc """
  Preliminary PEG grammar for PDF objects, built on `Neotomex.ExGrammar`.

  The root rule is `:object`. Transforms turn matches into plain Elixir terms:

    * booleans become `true` / `false`
    * numbers become integers or floats
    * literal and hex strings become binaries
    * names become binaries that keep their leading `/`
    * arrays become lists
    * dictionaries become maps keyed by name
    * indirect references become maps with `"object_id"` and `"generation_id"`

  Known limitation: names still have edge cases (see the TODO on `:name`).
  """

  use Neotomex.ExGrammar

  # Debugging aid: prints a label followed by the inspected term. Not called by
  # any rule; drop it into a transform while investigating a match.
  defp describe_object(o, prefix) do
    IO.puts "#{prefix} received"
    IO.inspect(o)
  end

  # Grammar entry point: ordered choice over the top-level object kinds.
  @root true
  define :object, "bool / numeric_object / string_object / name / array / dictionary"

  # `<< /Key value ... >>`. Each name_object yields a single-entry map; the
  # entries are merged left to right, so a repeated key keeps its last value.
  define :dictionary, "<'<<'> <space*> (name_object <space*>)* <space*> <'>>'> <space*>" do
    entries ->
      entries
      |> List.flatten
      |> Enum.reduce(%{}, fn entry, acc -> Map.merge(acc, entry) end)
  end

  define :name, "'/' [a-zA-Z0-9#_+*]*" do
    # TODO: This doesn't properly handle cases where there is no space between name definitions
    # (e.g. /Type/Page), which shouldn't be valid anyway, but some PDF creators are stupid
    parts -> Enum.join(parts)
  end

  # A dictionary entry: a name followed by its value, returned as a one-entry map.
  define :name_object, "name <space*> (dictionary / name / string_object / bool / object_reference / numeric_object / array ) <space*>" do
    [key, value] -> %{key => value}
  end

  # `[ item item ... ]`. Whitespace between items matches the `<space>` branch
  # and shows up as nil in the item list, so nils are filtered out.
  define :array, "<'['> <space*> (dictionary / name / string_object / bool / object_reference / numeric_object / array / <space>)+ <']'>" do
    [items] -> Enum.reject(items, &is_nil/1)
  end

  # Indirect reference of the form `12 0 R`.
  define :object_reference, "integer <space> integer <space> <'R'>" do
    [obj_id, gen_id] ->
      %{
        "object_id" => obj_id,
        "generation_id" => gen_id
      }
  end

  define :string_object, "hex_string_object / regular_string_object"

  define :regular_string_object, "paren_wrapped_string" do
    # paren_wrapped_string always yields "(" <> content <> ")", so dropping the
    # first and last byte removes only the outermost parentheses. Nested
    # parentheses remain part of the literal, and (unlike a `.`-based regex)
    # content containing newlines is preserved intact.
    wrapped -> binary_part(wrapped, 1, byte_size(wrapped) - 2)
  end

  define :paren_wrapped_string, "'(' ([^\(\)] / paren_wrapped_string)* ')'" do
    # Recursing on the rule itself is what lets balanced nested parentheses
    # appear inside a parenthesis-delimited string.
    parts -> Enum.join(parts)
  end

  define :hex_string_object, "<'<'> hex_string <'>'>" do
    [decoded] -> decoded
  end

  define :bool, "'true' / 'false'" do
    "false" -> false
    "true" -> true
  end

  define :hex_string, "[a-fA-F0-9]+" do
    chars -> chars |> Enum.join |> hex_to_binary
  end

  # Optional sign followed by a float or an integer. Only "-" changes the value.
  define :numeric_object, "('+' / '-')? (float / integer)" do
    ["-", number] -> -number
    [_, number] -> number
    # Fallback for any other match shape: pass it through untouched.
    n -> n
  end

  # Built from raw digit text rather than `integer`, so leading zeros in the
  # fraction survive: "1.05" must become 1.05, not 1.5.
  define :float, "digits <'.'> digits" do
    [whole, fraction] -> String.to_float("#{whole}.#{fraction}")
  end

  define :integer, "[0-9]+" do
    chars -> chars |> Enum.join |> String.to_integer
  end

  # One or more decimal digits, kept as the original text.
  define :digits, "[0-9]+" do
    chars -> Enum.join(chars)
  end

  define :space, "[ \\r\\n\\s\\t]"

  # String utilities

  # Decodes a string of hex digits (either case) into bytes, then converts the
  # bytes to UTF-8 via decode_string/1. An odd number of digits is completed
  # with a trailing "0", following the PDF rule that a missing final digit is
  # assumed to be 0.
  defp hex_to_binary(hex_string) do
    padded =
      if rem(byte_size(hex_string), 2) == 1 do
        hex_string <> "0"
      else
        hex_string
      end

    padded
    |> Base.decode16!(case: :mixed)
    |> decode_string
  end

  # Removes a leading byte-order mark, if any, and re-encodes the remaining
  # bytes as UTF-8 according to the encoding :unicode.bom_to_encoding/1 reports
  # (:latin1 when there is no BOM).
  defp decode_string(str) when is_binary(str) do
    {encoding, bom_length} = :unicode.bom_to_encoding(str)
    <<_bom::binary-size(bom_length), payload::binary>> = str

    case encoding do
      {:utf16, _endianness} ->
        :unicode.characters_to_binary(payload, encoding, :utf8)

      :latin1 ->
        :unicode.characters_to_binary(payload, encoding, :utf8)

      {:utf32, endianness} ->
        # NOTE(review): a UTF-32 BOM is handled by putting two zero bytes back
        # and decoding as UTF-16 with the same endianness. Presumably this
        # treats the input as UTF-16 data whose first code unit was 0x0000 and
        # got absorbed into the 4-byte BOM match -- confirm before relying on it.
        :unicode.characters_to_binary(<<0, 0>> <> payload, {:utf16, endianness}, :utf8)

      _ ->
        payload
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment