Created
January 27, 2020 00:29
-
-
Save lgessler/8cf668e4751df821a7f2388c686343d8 to your computer and use it in GitHub Desktop.
instaparse parser for CONLLU format
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns erracle.conll.parse | |
(:require [clojure.java.io :as io] | |
[instaparse.core :as insta])) | |
(def conllu-parser
  "Instaparse parser for the CoNLL-U format (vanilla 10 columns only).

  Parsing a string yields a seq with one vector per non-empty line. Each vector
  starts with a keyword naming the line type (:comment-line, :token-line,
  :supertoken-line or :ellipsis-line). Token-like lines then carry one tagged
  vector per column: the id followed by :form, :lemma, :upos, :xpos, :feats,
  :head, :deprel, :deps and :misc. Empty lines are hidden from the output."
  (insta/parser
    ;; This aims to be complete for the specification as it existed in January 2020.
    ;; Cf. https://universaldependencies.org/format.html
    ;; This parser does not enforce certain constraints, e.g. that feature names
    ;; conform to the pattern #'[A-Z0-9][A-Z0-9a-z]*(\\[[a-z0-9]+\\])?'
    "
    <file> ::= line*
    <line> ::= comment-line
             | token-line
             | supertoken-line
             | ellipsis-line
             | <empty-line>

    <EOL> ::= <#'\\r\\n'>
            | <#'\\r'>
            | <#'\\n'>
            | <#'\\Z'> (* \\Z means EOF *)

    (* cool [^\\S\\r\\n] pattern from https://stackoverflow.com/questions/3469080/match-whitespace-but-not-newlines *)
    empty-line      ::= #'[^\\S\\r\\n]*' EOL
    (* \\r is excluded so that a CRLF line ending is not captured as comment text *)
    comment-line    ::= #'#[^\\r\\n]*' EOL
    token-line      ::= id-token <'\t'> 9-columns EOL
    supertoken-line ::= id-supertoken <'\t'> 9-columns EOL
    ellipsis-line   ::= id-ellipsis <'\t'> 9-columns EOL

    (* columns ****************************************************************************)
    (* column 1 used to differentiate lines: supertoken, ellipsis, and token lines are all
     * distinguished based on their IDs.
     *)
    id-token      ::= #'\\d+'
    id-supertoken ::= #'\\d+-\\d+'
    id-ellipsis   ::= #'\\d+\\.\\d+'

    <9-columns> ::= form
                    <'\t'> lemma
                    <'\t'> upos
                    <'\t'> xpos
                    <'\t'> feats
                    <'\t'> head
                    <'\t'> deprel
                    <'\t'> deps
                    <'\t'> misc

    (* line breaks are excluded so that a column can never run on into the next line *)
    <underscore-or-not-tab>               ::= '_' | #'[^\t\\r\\n]+'
    <underscore-or-not-tab-or-whitespace> ::= '_' | #'[^\t\\s]+'
    <underscore-or-digits>                ::= '_' | #'\\d+'

    form   ::= underscore-or-not-tab
    lemma  ::= underscore-or-not-tab
    upos   ::= underscore-or-not-tab-or-whitespace
    xpos   ::= underscore-or-not-tab-or-whitespace
    head   ::= underscore-or-digits
    deprel ::= underscore-or-not-tab-or-whitespace

    (* e.g., `Case=Nom|NumType=Card`; layered names such as `Number[psor]=Sing` and
     * multi-valued features such as `Case=Acc,Dat` are accepted as well *)
    feats       ::= '_' | feat-list
    <feat-list> ::= ε | feat feat-tail
    <feat-tail> ::= ε | <'|'> feat feat-tail
    feat        ::= #'[^=|\\s]+' <'='> #'[^|\\s]+'

    (* e.g., `2:nsubj|4:nmod:poss|8.1:conj`: the head may be an empty-node id and the
     * relation may itself contain colons; only the first colon separates the two *)
    deps       ::= '_' | dep-list
    <dep-list> ::= ε | dep dep-tail
    <dep-tail> ::= ε | <'|'> dep dep-tail
    dep        ::= #'\\d+(\\.\\d+)?' <':'> #'[^|\\s]+'

    (* e.g., `SpaceAfter=No|Gloss=to-go`: the key ends at the first `=`, the value runs
     * up to the next `|` and may contain spaces *)
    misc        ::= '_' | misc-list
    <misc-list> ::= ε | misc-item misc-tail
    <misc-tail> ::= ε | <'|'> misc-item misc-tail
    misc-item   ::= #'[^=|\t\\r\\n]+' <'='> #'[^|\t\\r\\n]+'
    "))
;; REPL usage example. It lives inside `comment` so that loading this namespace
;; neither runs the parse nor tries to evaluate `=>` as a symbol; evaluate the
;; inner form by hand to reproduce the result shown below it.
(comment
  (conllu-parser "
5\tform\tlemma\tupos\txpos\tA=B\t1\tqwe\t_\t_
")
  ;; =>
  ;; ([:token-line
  ;;   [:id-token "5"]
  ;;   [:form "form"]
  ;;   [:lemma "lemma"]
  ;;   [:upos "upos"]
  ;;   [:xpos "xpos"]
  ;;   [:feats [:feat "A" "B"]]
  ;;   [:head "1"]
  ;;   [:deprel "qwe"]
  ;;   [:deps "_"]
  ;;   [:misc "_"]])
  )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment