lgessler · January 27, 2020 00:29
diff --git a/instaconllu.clj b/instaconllu.clj
 (ns erracle.conll.parse
  (:require [clojure.java.io :as io]
            [instaparse.core :as insta]))

 (def conllu-parser
   "Instaparse parser for the CoNLL-U format, covering only the ten standard
   columns of the specification as it existed in January 2020.
   Cf. https://universaldependencies.org/format.html

   Parsing yields a seq with one vector per comment, token, supertoken, or
   ellipsis line; blank lines are dropped. Each vector starts with a keyword
   naming the line type. For the three token-like line types the keyword is
   followed by ten tagged columns (eleven items in total).

   The grammar is deliberately lenient: it does not enforce constraints such
   as the exact spelling rules for feature names. FEATS and MISC items must
   have the shape key=value, and DEPS items the shape head:relation."
   (insta/parser
     "
     <file> ::= line*
     <line> ::= comment-line
              | token-line
              | supertoken-line
              | ellipsis-line
              | <empty-line>

     <EOL> ::= <#'\\r\\n'>
             | <#'\\r'>
             | <#'\\n'>
             | <#'\\Z'> (* \\Z: end of input, allowing for one final line terminator *)

     (* [^\\S\\r\\n] matches whitespace other than line breaks, cf.
      * https://stackoverflow.com/questions/3469080/match-whitespace-but-not-newlines *)
     empty-line   ::= #'[^\\S\\r\\n]*' EOL
     (* stop at either kind of line break so CRLF input leaves no stray \\r in the text *)
     comment-line ::= #'#[^\\r\\n]*' EOL
     token-line      ::= id-token      <'\t'> 9-columns EOL
     supertoken-line ::= id-supertoken <'\t'> 9-columns EOL
     ellipsis-line   ::= id-ellipsis   <'\t'> 9-columns EOL

     (* columns ****************************************************************************)
     (* column 1 used to differentiate lines: supertoken, ellipsis, and token lines are all
      * distinguished based on their IDs.
      *)
     id-token      ::= #'\\d+'
     id-supertoken ::= #'\\d+-\\d+'
     id-ellipsis   ::= #'\\d+\\.\\d+'

     <9-columns> ::= form
              <'\t'> lemma
              <'\t'> upos
              <'\t'> xpos
              <'\t'> feats
              <'\t'> head
              <'\t'> deprel
              <'\t'> deps
              <'\t'> misc

     (* FORM and LEMMA may contain spaces, but never a tab or a line break *)
     <underscore-or-not-tab>               ::= '_' | #'[^\t\\r\\n]+'
     <underscore-or-not-tab-or-whitespace> ::= '_' | #'[^\t\\s]+'
     <underscore-or-digits>                ::= '_' | #'\\d+'

     form   ::= underscore-or-not-tab
     lemma  ::= underscore-or-not-tab
     upos   ::= underscore-or-not-tab-or-whitespace
     xpos   ::= underscore-or-not-tab-or-whitespace
     head   ::= underscore-or-digits
     deprel ::= underscore-or-not-tab-or-whitespace

     (* e.g., `Case=Nom|NumType=Card`, `Number[psor]=Sing`, `Case=Acc,Dat` *)
     feats       ::= '_' | feat-list
     <feat-list> ::= ε | feat feat-tail
     <feat-tail> ::= ε | <'|'> feat feat-tail
     feat        ::= #'[^\\s=|]+' <'='> #'[^\\s=|]+'

     (* e.g., `4:nsubj:pass|5.1:conj`: the head may be an empty-node ID and the
      * relation may itself contain colons *)
     deps        ::= '_' | dep-list
     <dep-list>  ::= ε | dep dep-tail
     <dep-tail>  ::= ε | <'|'> dep dep-tail
     dep         ::= #'\\d+(?:\\.\\d+)?' <':'> #'[^\\s|]+'

     (* e.g., `SpaceAfter=No|Gloss=foo-bar`: values run up to the next `|` or line end *)
     misc        ::= '_' | misc-list
     <misc-list> ::= ε | misc-item misc-tail
     <misc-tail> ::= ε | <'|'> misc-item misc-tail
     misc-item   ::= #'[^\\s=|]+' <'='> #'[^\t\\r\\n|]+'
     "))

 (comment
   ;; Usage example, kept inside a `comment` form so that loading this file
   ;; neither runs the parser nor trips over the printed result below.
   ;; The input is one blank line followed by a single ten-column token line.
   ;; It is written with explicit \n and \t escapes so it does not pick up the
   ;; indentation of this source file.
   (conllu-parser "\n5\tform\tlemma\tupos\txpos\tA=B\t1\tqwe\t_\t_\n")
   ;; =>
   ([:token-line
     [:id-token "5"]
     [:form "form"]
     [:lemma "lemma"]
     [:upos "upos"]
     [:xpos "xpos"]
     [:feats [:feat "A" "B"]]
     [:head "1"]
     [:deprel "qwe"]
     [:deps "_"]
     [:misc "_"]]))
	(ns erracle.conll.parse
	(:require [clojure.java.io :as io]
	[instaparse.core :as insta]))

	(def conllu-parser
	  "Instaparse parser for the CoNLL-U format, covering only the ten standard
	  columns of the specification as it existed in January 2020.
	  Cf. https://universaldependencies.org/format.html

	  Parsing yields a seq with one vector per comment, token, supertoken, or
	  ellipsis line; blank lines are dropped. Each vector starts with a keyword
	  naming the line type. For the three token-like line types the keyword is
	  followed by ten tagged columns (eleven items in total).

	  The grammar is deliberately lenient: it does not enforce constraints such
	  as the exact spelling rules for feature names. FEATS and MISC items must
	  have the shape key=value, and DEPS items the shape head:relation."
	  (insta/parser
	    "
	    <file> ::= line*
	    <line> ::= comment-line
	             | token-line
	             | supertoken-line
	             | ellipsis-line
	             | <empty-line>

	    <EOL> ::= <#'\\r\\n'>
	            | <#'\\r'>
	            | <#'\\n'>
	            | <#'\\Z'> (* \\Z: end of input, allowing for one final line terminator *)

	    (* [^\\S\\r\\n] matches whitespace other than line breaks, cf.
	     * https://stackoverflow.com/questions/3469080/match-whitespace-but-not-newlines *)
	    empty-line   ::= #'[^\\S\\r\\n]*' EOL
	    (* stop at either kind of line break so CRLF input leaves no stray \\r in the text *)
	    comment-line ::= #'#[^\\r\\n]*' EOL
	    token-line      ::= id-token      <'\t'> 9-columns EOL
	    supertoken-line ::= id-supertoken <'\t'> 9-columns EOL
	    ellipsis-line   ::= id-ellipsis   <'\t'> 9-columns EOL

	    (* columns ****************************************************************************)
	    (* column 1 used to differentiate lines: supertoken, ellipsis, and token lines are all
	     * distinguished based on their IDs.
	     *)
	    id-token      ::= #'\\d+'
	    id-supertoken ::= #'\\d+-\\d+'
	    id-ellipsis   ::= #'\\d+\\.\\d+'

	    <9-columns> ::= form
	             <'\t'> lemma
	             <'\t'> upos
	             <'\t'> xpos
	             <'\t'> feats
	             <'\t'> head
	             <'\t'> deprel
	             <'\t'> deps
	             <'\t'> misc

	    (* FORM and LEMMA may contain spaces, but never a tab or a line break *)
	    <underscore-or-not-tab>               ::= '_' | #'[^\t\\r\\n]+'
	    <underscore-or-not-tab-or-whitespace> ::= '_' | #'[^\t\\s]+'
	    <underscore-or-digits>                ::= '_' | #'\\d+'

	    form   ::= underscore-or-not-tab
	    lemma  ::= underscore-or-not-tab
	    upos   ::= underscore-or-not-tab-or-whitespace
	    xpos   ::= underscore-or-not-tab-or-whitespace
	    head   ::= underscore-or-digits
	    deprel ::= underscore-or-not-tab-or-whitespace

	    (* e.g., `Case=Nom|NumType=Card`, `Number[psor]=Sing`, `Case=Acc,Dat` *)
	    feats       ::= '_' | feat-list
	    <feat-list> ::= ε | feat feat-tail
	    <feat-tail> ::= ε | <'|'> feat feat-tail
	    feat        ::= #'[^\\s=|]+' <'='> #'[^\\s=|]+'

	    (* e.g., `4:nsubj:pass|5.1:conj`: the head may be an empty-node ID and the
	     * relation may itself contain colons *)
	    deps        ::= '_' | dep-list
	    <dep-list>  ::= ε | dep dep-tail
	    <dep-tail>  ::= ε | <'|'> dep dep-tail
	    dep         ::= #'\\d+(?:\\.\\d+)?' <':'> #'[^\\s|]+'

	    (* e.g., `SpaceAfter=No|Gloss=foo-bar`: values run up to the next `|` or line end *)
	    misc        ::= '_' | misc-list
	    <misc-list> ::= ε | misc-item misc-tail
	    <misc-tail> ::= ε | <'|'> misc-item misc-tail
	    misc-item   ::= #'[^\\s=|]+' <'='> #'[^\t\\r\\n|]+'
	    "))

	(comment
	  ;; Usage example, kept inside a `comment` form so that loading this file
	  ;; neither runs the parser nor trips over the printed result below.
	  ;; The input is one blank line followed by a single ten-column token line.
	  ;; It is written with explicit \n and \t escapes so it does not pick up the
	  ;; indentation of this source file.
	  (conllu-parser "\n5\tform\tlemma\tupos\txpos\tA=B\t1\tqwe\t_\t_\n")
	  ;; =>
	  ([:token-line
	    [:id-token "5"]
	    [:form "form"]
	    [:lemma "lemma"]
	    [:upos "upos"]
	    [:xpos "xpos"]
	    [:feats [:feat "A" "B"]]
	    [:head "1"]
	    [:deprel "qwe"]
	    [:deps "_"]
	    [:misc "_"]]))