Created
April 23, 2020 04:26
-
-
Save hiepph/415e02d073e0c3df1ef2a6a80e06a313 to your computer and use it in GitHub Desktop.
Tabnet preparation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Prepare {src,tgt}-train.txt for OpenNMT Image2Text | |
-- labels (json) >> prepareDataOpenNMT.hs >> src+tgt+vocab.txt (io) | |
-- src: tgt | |
-- 123.jpg <tr><td></td> | |
-- 456.jpg <tr colspan=2 ><td></td> | |
-- | |
-- vocab: | |
-- <td | |
-- colspan="2" | |
-- > | |
-- </td> | |
-- | |
-- Usage: | |
-- prepareDataOpenNMT.hs labels.json {train,val} | |
{-# LANGUAGE OverloadedStrings #-} | |
import Data.Char | |
import Data.List | |
import qualified Data.Map as Map | |
import Data.Aeson | |
import System.Environment | |
import Control.Applicative | |
import Control.Monad (mzero) | |
import qualified Data.ByteString.Lazy as B | |
data Label = Label | |
{ images :: [Image] | |
} deriving Show | |
instance FromJSON Label where | |
parseJSON (Object o) = Label | |
<$> (o .: "images") | |
parseJSON _ = mzero | |
data Image = Image | |
{ split :: String | |
, filename :: String | |
, structure :: [String] | |
, cells :: [Token] | |
} deriving Show | |
instance FromJSON Image where | |
parseJSON (Object o) = Image | |
<$> (o .: "split") | |
<*> (o .: "filename") | |
<*> ((o .: "html") >>= (.: "structure") >>= (.: "tokens")) | |
<*> ((o .: "html") >>= (.: "cells")) | |
parseJSON _ = mzero | |
data Token = Token | |
{ tokens :: [String] | |
} deriving Show | |
instance FromJSON Token where | |
parseJSON (Object o) = Token | |
<$> (o .: "tokens") | |
parseJSON _ = mzero | |
main :: IO () | |
main = do | |
args <- getArgs | |
input <- B.readFile $ head args | |
let label = decode input :: Maybe Label | |
case label of | |
Nothing -> print "error parsing JSON" | |
Just l -> do | |
exportSrc imgs | |
exportTgt imgs | |
case dist of | |
"train" -> exportVocab imgs | |
"val" -> return () | |
where dist = last args | |
imgs = filter (\im -> split im == dist) (images l) | |
exportSrc = writeFile | |
("src-" ++ dist ++ ".txt") | |
. intercalate "\n" . map filename | |
exportTgt imgs = do | |
-- get corresponding structure and cells | |
let sts = map (map trim) $ map structure $ imgs | |
let cls = map (map (intercalate "")) $ map (map tokens) $ map cells $ imgs | |
-- update new value | |
let table = map (intercalate " ") $ map update $ zip sts cls | |
writeFile ("tgt-" ++ dist ++ ".txt") $ intercalate "\n" table | |
exportVocab imgs = writeFile | |
"vocab.txt" | |
$ intercalate "\n" $ (sort $ map trim $ nub $ concat $ map structure imgs) ++ ["y", "n"] | |
trim :: String -> String | |
trim = f . f | |
f = reverse . dropWhile isSpace | |
tdIndex :: [String] -> [Int] | |
tdIndex = map (fst) . filter ((`elem` ["<td>", ">"]) . snd) . zip [0..] | |
booleanCell :: [String] -> [String] | |
booleanCell = map (\v -> if length v > 0 then "y" else "n") | |
update :: ([String], [String]) -> [String] | |
update (st, cl) = let | |
is = tdIndex st | |
vs = booleanCell cl | |
m = Map.fromList $ zip is vs | |
in map (\i -> case Map.lookup i m of {Just v -> (st !! i) ++ " " ++ v; Nothing -> (st !! i)}) [0..(length st) - 1] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment