Place the ud-treebanks-v2.0 folder in this directory before you run the examples. You can download the ud-treebanks-v2.0.tgz archive here. Also run npm install in this directory to install the dependencies.
You can read the full blog post here.
Place the ud-treebanks-v2.0 folder in this directory before you run the examples. You can download the ud-treebanks-v2.0.tgz archive here. Also run npm install in this directory to install the dependencies.
You can read the full blog post here.
var conllu = require('conllu-stream'); | |
var fs = require('fs'); | |
// Read the German training treebank and feed it through the CoNLL-U parser.
const treebank = fs.createReadStream('ud-treebanks-v2.0/UD_German/de-ud-train.conllu');
const sentences = treebank.pipe(conllu());

// For every parsed sentence, log its `sent_id` feature followed by the
// sentence's `toString()` rendering.
sentences.on('data', function printSentence(sentence) {
  const sentenceId = sentence.features.sent_id;
  console.log(sentenceId, sentence.toString());
});
var _ = require('lodash'); | |
var conllu = require('conllu-stream'); | |
var fs = require('fs'); | |
/**
 * Formats a ratio as a compact percent string.
 *
 * The ratio is multiplied by 100 and rendered with a number of significant
 * digits that depends on its magnitude:
 *   - more than 10%  -> 3 significant digits (e.g. "50.0%")
 *   - more than 1%   -> 2 significant digits (e.g. "5.0%")
 *   - otherwise      -> 1 significant digit  (e.g. "0.5%")
 *
 * @param {number} value - Ratio to format (0.5 means 50%).
 * @returns {string} The formatted percentage, or '' when the value is not a
 *   number (e.g. `undefined` or the result of 0/0).
 */
function percent(value) {
  const scaled = value * 100;
  if (Number.isNaN(scaled)) {
    return '';
  }
  // Pick the precision from the magnitude so negative values are formatted
  // like their positive counterparts instead of always getting one digit.
  const magnitude = Math.abs(scaled);
  // toPrecision(3) switches to exponent notation once the value rounds to
  // four or more integer digits ("1.00e+3"); print a plain integer instead.
  if (magnitude >= 999.5) {
    return Math.round(scaled) + '%';
  }
  if (magnitude > 10) {
    return scaled.toPrecision(3) + '%';
  }
  if (magnitude > 1) {
    return scaled.toPrecision(2) + '%';
  }
  return scaled.toPrecision(1) + '%';
}
/**
 * Calculates and prints a two-level histogram of `words`.
 *
 * The primary histogram counts the words per value of `key`. For every
 * displayed value, a secondary histogram of `linkKey` over the words sharing
 * that value is summarized (its six most frequent entries, with percentages).
 *
 * Only the 10 most frequent and 10 least frequent primary values are shown,
 * separated by a "--" row. When the histogram has 20 or fewer values, all of
 * them are shown once and no separator is printed (slicing top and bottom
 * separately would otherwise print overlapping rows twice).
 *
 * Output goes to the console as tab-separated rows:
 *   value, count, share of all words, summary of linked values.
 *
 * @param {Array<Object>} words - Word objects to analyze.
 * @param {string} key - Property name used for the primary histogram.
 * @param {string} linkKey - Property name used for the secondary histogram.
 * @returns {void}
 */
function displayHistogram(words, key, linkKey) {
  const SHOWN_PER_END = 10;
  const LINKED_SHOWN = 6;

  // Calculate `key` histogram of words, sorted by descending frequency
  // (ties broken by descending key).
  const grouped = _.groupBy(words, key);
  const histogram = _(grouped)
    .mapValues('length')
    .toPairs()
    .sortBy([ 1, 0 ])
    .reverse()
    .value();

  // Builds one display row for a [value, count] pair of the primary histogram.
  const toRow = ([ value, count ]) => {
    const linked = _.map(grouped[value], linkKey);
    const linkedSummary = _(linked)
      .groupBy()
      .mapValues('length')
      .toPairs()
      .sortBy([ 1, 0 ])
      .reverse()
      // Show percent values for each item.
      .map(item => `${item[0]} (${percent(item[1] / linked.length)})`)
      .slice(0, LINKED_SHOWN)
      .join(', ');
    return [ value, count, percent(count / words.length), linkedSummary ];
  };

  // The separator is built directly rather than passed through `toRow`, so it
  // never picks up statistics of a real key that happens to be "--".
  const separatorRow = [ '--', '', '', '' ];

  let rows;
  if (histogram.length <= 2 * SHOWN_PER_END) {
    // Small histogram: top and bottom slices would overlap, so show it all.
    rows = histogram.map(pair => toRow(pair));
  } else {
    const top = histogram.slice(0, SHOWN_PER_END).map(pair => toRow(pair));
    const bottom = histogram.slice(-SHOWN_PER_END).map(pair => toRow(pair));
    rows = [ ...top, separatorRow, ...bottom ];
  }

  // Display table of results.
  console.log('-- %s --', key);
  console.log();
  console.log(rows.map(row => row.join('\t')).join('\n'));
  console.log();
  console.log('#words :', words.length);
  console.log('#histogram :', histogram.length);
  console.log();
}
// Every word object kept from the treebank ends up in this array.
const words = [];

// `upostag` values whose words are left out of the statistics.
const skippedTags = [ 'PUNCT', 'NUM' ];

// Lowercases a word's form and lemma in place (so "Haus" and "haus" are
// counted together) and stores the word in `words`.
function collectWord(word) {
  word.form = word.form.toLowerCase();
  word.lemma = word.lemma.toLowerCase();
  words.push(word);
}

// Parse the CoNLL-U file as a stream of sentences.
const wordStream = fs
  .createReadStream('ud-treebanks-v2.0/UD_German/de-ud-train.conllu')
  .pipe(conllu());

// Per sentence: keep all words except punctuation and numbers.
wordStream.on('data', sentence => {
  const kept = sentence.getSequence()
    .filter(word => !skippedTags.includes(word.upostag));
  kept.forEach(word => collectWord(word));
});

// Once the whole file is read: show the form -> lemma histogram, then the
// lemma -> form histogram.
wordStream.on('end', () => {
  displayHistogram(words, 'form', 'lemma');
  displayHistogram(words, 'lemma', 'form');
  console.log('-- done --');
});
var _ = require('lodash'); | |
var conllu = require('conllu-stream'); | |
var fs = require('fs'); | |
// One "<multiword>\t-->\t<expansion>" line per multiword token encountered.
const multiwords = [];

// Builds the lowercase "<multiword>\t-->\t<expansion>" line for one multiword
// token. The expansion is the space-joined `form` of every token whose id
// lies in the range from `position` to `endPosition` of the multiword.
function describeMultiword(sentence, multiword) {
  const memberIds = _.range(multiword.position, multiword.endPosition + 1);
  const memberForms = memberIds.map(id => sentence.tokens[String(id)].form);
  const expansion = memberForms.join(' ');
  return multiword.form.toLowerCase() + '\t-->\t' + expansion.toLowerCase();
}

const multiwordStream = fs
  .createReadStream('ud-treebanks-v2.0/UD_German/de-ud-train.conllu')
  .pipe(conllu());

// Per sentence: look up every multiword token and record it together with
// its expansion.
multiwordStream.on('data', sentence => {
  sentence.structure.multiwords.forEach(id => {
    const multiword = sentence.tokens[id];
    multiwords.push(describeMultiword(sentence, multiword));
  });
});

// At the end of the file: count identical lines and print them sorted by
// descending frequency (ties in descending line order).
multiwordStream.on('end', () => {
  const counts = _.mapValues(_.groupBy(multiwords), 'length');
  const ranked = _.sortBy(_.toPairs(counts), [ 1, 0 ]).reverse();
  const report = ranked.map(row => row.join('\t\t')).join('\n');
  console.log(report);
});
{ | |
"dependencies": { | |
"conllu-stream": "0.0.1", | |
"lodash": "^4.17.4" | |
} | |
} |
# sent_id = train-s2 | |
# text = Die Kosten sind definitiv auch im Rahmen. | |
#id form lemma upostag xpostag feats head deprel deps misc | |
1 Die der DET ART Case=Nom|Definite=Def|Gender=Fem|Number=Sing|PronType=Art 2 det _ _ | |
2 Kosten Kosten NOUN NN Case=Nom|Gender=Fem|Number=Sing 3 nsubj:pass _ _ | |
3 sind sein VERB VAFIN Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _ | |
4 definitiv definitiv ADV ADJD _ 3 advmod _ _ | |
5 auch auch ADV ADV _ 3 advmod _ _ | |
6-7 im _ _ _ _ _ _ _ _ | |
6 in in ADP APPR _ 8 case _ _ | |
7 dem der DET ART Case=Dat|Definite=Def|Gender=Masc,Neut|Number=Sing|PronType=Art 8 det _ _ | |
8 Rahmen Rahmen NOUN NN Case=Dat|Gender=Masc,Neut|Number=Sing 3 obl _ SpaceAfter=No | |
9 . . PUNCT $. _ 3 punct _ _ |