Skip to content

Instantly share code, notes, and snippets.

@kleem
Last active August 29, 2015 14:02
Show Gist options
  • Save kleem/7bd64c5ff480d85c922b to your computer and use it in GitHub Desktop.
Save kleem/7bd64c5ff480d85c922b to your computer and use it in GitHub Desktop.
Linguistic annotations

An experiment on visualizing linguistic annotations of a (small) corpus.

The example uses a nonstandard, super-simple JSON format coded by hand (please forgive me for the errors I surely made from a linguistic standpoint).

This visualization focuses on three different aspects of the analysis: sentence splitting (a gray ■ introduces a new sentence), tokenization and lemmatization (each token has an underline and its lemma written under it) and part-of-speech tagging (the color of the underline and the lemma indicates whether the term is a noun, a verb, etc.).

The original text's spacing, punctuation and line breaks are preserved, as can be seen in the last two lines.

Various CSS hacks (line heights, relative positioning and the like) are used to create this layout, so features such as text selection are broken.

[
[
{"token":"Halley's Comet" , "lemma":"Halley's Comet", "pos":"noun" , "ne": {"class": "astronomical_object", "id": "http://en.wikipedia.org/wiki/Halley%27s_Comet"}},
{"token":" "},
{"token":"or" , "lemma":"or" , "pos":"conjunction"},
{"token":" "},
{"token":"Comet Halley" , "lemma":"Comet Halley" , "pos":"noun" , "ne": {"class": "astronomical_object", "id": "http://en.wikipedia.org/wiki/Halley%27s_Comet"}},
{"token":" "},
{"token":"is" , "lemma":"be" , "pos":"verb"},
{"token":" "},
{"token":"the" , "lemma":"the" , "pos":"article"},
{"token":" "},
{"token":"best-known" , "lemma":"best-known" , "pos":"adjective"},
{"token":" "},
{"token":"of" , "lemma":"of" , "pos":"preposition"},
{"token":" "},
{"token":"the" , "lemma":"the" , "pos":"article"},
{"token":" "},
{"token":"short-period" , "lemma":"short-period" , "pos":"adjective"},
{"token":" "},
{"token":"comets" , "lemma":"comet" , "pos":"noun"},
{"token":" "},
{"token":"and" , "lemma":"and" , "pos":"conjunction"},
{"token":" "},
{"token":"is" , "lemma":"be" , "pos":"verb"},
{"token":" "},
{"token":"visible" , "lemma":"visible" , "pos":"adjective"},
{"token":" "},
{"token":"from" , "lemma":"from" , "pos":"preposition"},
{"token":" "},
{"token":"Earth" , "lemma":"Earth" , "pos":"noun" , "ne": {"class": "astronomical_object", "id": "http://en.wikipedia.org/wiki/Earth"}},
{"token":" "},
{"token":"every" , "lemma":"every" , "pos":"adverb"},
{"token":" "},
{"token":"75" , "lemma":"75" , "pos":"adjective"},
{"token":"–"},
{"token":"76" , "lemma":"76" , "pos":"adjective"},
{"token":" "},
{"token":"years" , "lemma":"year" , "pos":"noun"},
{"token":".\n"}
],[
{"token":"Halley's Comet" , "lemma":"Halley's Comet", "pos":"noun" , "ne": {"class": "astronomical_object", "id": "http://en.wikipedia.org/wiki/Halley%27s_Comet"}},
{"token":" "},
{"token":"returns" , "lemma":"return" , "pos":"noun"},
{"token":" "},
{"token":"to" , "lemma":"to" , "pos":"preposition"},
{"token":" "},
{"token":"the" , "lemma":"the" , "pos":"article"},
{"token":" "},
{"token":"inner" , "lemma":"inner" , "pos":"adjective"},
{"token":" "},
{"token":"Solar System" , "lemma":"Solar System" , "pos":"noun" , "ne": {"class": "astronomical_object", "id": "http://en.wikipedia.org/wiki/Solar_System"}},
{"token":" "},
{"token":"have been observed", "lemma":"observe" , "pos":"verb"},
{"token":" "},
{"token":"and" , "lemma":"and" , "pos":"conjunction"},
{"token":" "},
{"token":"recorded" , "lemma":"record" , "pos":"verb"},
{"token":" "},
{"token":"by" , "lemma":"by" , "pos":"preposition"},
{"token":" "},
{"token":"astronomers" , "lemma":"astronomer" , "pos":"noun"},
{"token":" "},
{"token":"since" , "lemma":"since" , "pos":"preposition"},
{"token":" "},
{"token":"at least" , "lemma":"at least" , "pos":"adverb"},
{"token":" "},
{"token":"240 BCE" , "lemma":"240 BCE" , "pos":"noun" , "ne": {"class": "date", "id": "-239"}},
{"token":".\n"}
],[
{"token":"The" , "lemma":"the" , "pos":"article"},
{"token":" "},
{"token":"comet's" , "lemma":"comet" , "pos":"noun"},
{"token":" "},
{"token":"periodicity" , "lemma":"periodicity" , "pos":"noun"},
{"token":" "},
{"token":"was determined" , "lemma":"determine" , "pos":"verb"},
{"token":" "},
{"token":"in" , "lemma":"in" , "pos":"preposition"},
{"token":" "},
{"token":"1705" , "lemma":"1705" , "pos":"noun" , "ne": {"class": "date", "id": "1705"}},
{"token":" "},
{"token":"by" , "lemma":"by" , "pos":"preposition"},
{"token":" "},
{"token":"English" , "lemma":"English" , "pos":"adjective"},
{"token":"\n"},
{"token":"astronomer" , "lemma":"astronomer" , "pos":"noun"},
{"token":" "},
{"token":"Edmond Halley" , "lemma":"Edmond Halley" , "pos":"noun" , "ne": {"class": "person", "id": "http://en.wikipedia.org/wiki/Edmond_Halley"}},
{"token":", "},
{"token":"after" , "lemma":"after" , "pos":"preposition"},
{"token":" "},
{"token":"whom" , "lemma":"whom" , "pos":"pronoun"},
{"token":" "},
{"token":"it" , "lemma":"it" , "pos":"pronoun"},
{"token":" "},
{"token":"is named" , "lemma":"name" , "pos":"verb"},
{"token":". "}
],[
{"token":"Halley's Comet" , "lemma":"Halley's Comet", "pos":"noun" , "ne": {"class": "astronomical_object", "id": "http://en.wikipedia.org/wiki/Halley%27s_Comet"}},
{"token":" "},
{"token":"last" , "lemma":"last" , "pos":"adverb"},
{"token":" "},
{"token":"appeared" , "lemma":"appear" , "pos":"verb"},
{"token":" "},
{"token":"in" , "lemma":"in" , "pos":"preposition"},
{"token":" "},
{"token":"the" , "lemma":"the" , "pos":"article"},
{"token":" "},
{"token":"inner" , "lemma":"inner" , "pos":"adjective"},
{"token":" "},
{"token":"Solar System" , "lemma":"Solar System" , "pos":"noun" , "ne": {"class": "astronomical_object", "id": "http://en.wikipedia.org/wiki/Solar_System"}},
{"token":" "},
{"token":"in" , "lemma":"in" , "pos":"preposition"},
{"token":" "},
{"token":"1986" , "lemma":"1986" , "pos":"noun" , "ne": {"class": "date", "id": "1986"}},
{"token":" "},
{"token":"and" , "lemma":"and" , "pos":"conjunction"},
{"token":" "},
{"token":"will appear" , "lemma":"appear" , "pos":"verb"},
{"token":" "},
{"token":"in" , "lemma":"in" , "pos":"preposition"},
{"token":" "},
{"token":"2061" , "lemma":"2061" , "pos":"noun" , "ne": {"class": "date", "id": "2061"}},
{"token":".\n"}
]
]
# Replacement entities for the characters that are significant in HTML text.
HTML_ESCAPES =
  '&': '&amp;'
  '<': '&lt;'
  '>': '&gt;'

# Convert a raw token string into the HTML inserted into a .token span.
# Markup-significant characters are escaped first, so that corpus text such as
# "a < b" is shown literally instead of being parsed as a tag; only afterwards
# are newlines turned into <br/> so the original line breaks are kept.
# A missing token is rendered as an empty string.
token_to_html = (token) ->
  escaped = String(token ? '').replace(/[&<>]/g, (ch) -> HTML_ESCAPES[ch])
  escaped.replace(/\n/g, '<br/>')

# Entry point, invoked from the page's onload handler.
# Loads 'halley.json' (an array of sentences, each an array of token records)
# and appends one span.sentence per sentence and one span.token per record.
# Records that carry a lemma additionally get a span.lemma child whose top
# border and text color are chosen from the record's part-of-speech tag.
window.main = () ->
  d3.json 'halley.json', (error, corpus) ->
    # Nothing can be drawn without the corpus: report the failure and stop.
    return console.warn(error) if error

    # pos colors
    pos_color = d3.scale.ordinal()
      .domain(['noun','verb','adjective','adverb','pronoun','conjunction','preposition','article'])
      .range(['#335BE2','#EF1D84','#FFBA1F','#57BF00','#24A6DE','#CCC','#D197C4','#CCC'])

    vis = d3.select('body')

    sentences = vis.selectAll('.sentence')
      .data(corpus)
      .enter().append('span')
      .attr('class', 'sentence')

    sentences.selectAll('.token')
      .data((d) -> d)
      .enter().append('span')
      .attr('class', 'token')
      .html((d) -> token_to_html(d.token))
      # whitespace and punctuation records have no lemma and get no annotation
      .filter((d) -> d.lemma?)
      .append('span')
      .attr('class', 'lemma')
      .text((d) -> d.lemma)
      .style('border-top', (d) -> "2px solid #{pos_color(d.pos)}")
      .style('color', (d) -> pos_color(d.pos))
/* Page margin around the whole visualization. */
body {
  padding: 40px;
}

/*
 * A sentence is a tall inline run: the generous line height leaves room
 * below each text line for the lemmas positioned under the tokens.
 */
.sentence {
  padding: 2px;
  line-height: 2.5em;
}

/* Gray square that introduces every sentence. */
.sentence::before {
  content: "■ ";
  color: #aaa;
}

/*
 * A token is the positioning context for its lemma; its own line height is
 * reset so the lemma offset is measured from the token text.
 */
.token {
  font-family: sans-serif;
  font-size: 10pt;
  position: relative;
  top: 0;
  left: 0;
  line-height: 1em;
}

/* The lemma sits under its token, centered and spanning the token's width. */
.lemma {
  position: absolute;
  top: 18px;
  left: 0;
  width: 100%;
  text-align: center;
  font-size: 0.8em;
}
<!DOCTYPE html>
<html>
  <head>
    <meta charset="utf-8">
    <title>Linguistic annotations</title>
    <link type="text/css" href="index.css" rel="stylesheet"/>
    <!-- Loaded over HTTPS: a plain-HTTP script is blocked as mixed content
         whenever this page itself is served over HTTPS. -->
    <script src="https://d3js.org/d3.v3.min.js"></script>
    <!-- Expected to define the global main() called by the onload handler below. -->
    <script src="index.js"></script>
  </head>
  <body onload="main()"></body>
</html>
(function() {
  // Part-of-speech tags, paired by index with POS_COLORS.
  var POS_TAGS = ['noun', 'verb', 'adjective', 'adverb', 'pronoun', 'conjunction', 'preposition', 'article'];
  var POS_COLORS = ['#335BE2', '#EF1D84', '#FFBA1F', '#57BF00', '#24A6DE', '#CCC', '#D197C4', '#CCC'];

  // Replacement entities for the characters that are significant in HTML text.
  var HTML_ESCAPES = { '&': '&amp;', '<': '&lt;', '>': '&gt;' };

  /**
   * Convert a raw token string into the HTML inserted into a .token span.
   *
   * Markup-significant characters are escaped first, so corpus text such as
   * "a < b" is shown literally instead of being parsed as a tag; only then
   * are newlines turned into <br/> so the original line breaks are kept.
   *
   * @param {*} token - raw token text; null/undefined is rendered as ''.
   * @returns {string} HTML-safe markup for the token.
   */
  function token_to_html(token) {
    var raw = token == null ? '' : String(token);
    var escaped = raw.replace(/[&<>]/g, function(ch) {
      return HTML_ESCAPES[ch];
    });
    return escaped.replace(/\n/g, '<br/>');
  }

  // In a browser this is always `window`; the fallback only lets the script
  // be loaded where no `window` exists (e.g. under a test harness).
  var root = typeof window !== 'undefined' ? window : globalThis;

  /**
   * Entry point, invoked from the page's onload handler.
   *
   * Loads 'halley.json' (an array of sentences, each an array of token
   * records) and appends one span.sentence per sentence and one span.token
   * per record. Records that carry a lemma additionally get a span.lemma
   * child whose top border and text color are chosen from the record's
   * part-of-speech tag.
   *
   * @returns {*} whatever d3.json returns for the request.
   */
  root.main = function() {
    return d3.json('halley.json', function(error, corpus) {
      var pos_color, sentences, tokens;

      // Nothing can be drawn without the corpus: report the failure and stop.
      if (error) return console.warn(error);

      pos_color = d3.scale.ordinal().domain(POS_TAGS).range(POS_COLORS);

      sentences = d3.select('body').selectAll('.sentence')
        .data(corpus)
        .enter().append('span')
        .attr('class', 'sentence');

      tokens = sentences.selectAll('.token')
        .data(function(d) {
          return d;
        })
        .enter().append('span')
        .attr('class', 'token')
        .html(function(d) {
          return token_to_html(d.token);
        });

      // Whitespace and punctuation records have no lemma and get no annotation.
      return tokens
        .filter(function(d) {
          return d.lemma != null;
        })
        .append('span')
        .attr('class', 'lemma')
        .text(function(d) {
          return d.lemma;
        })
        .style('border-top', function(d) {
          return '2px solid ' + pos_color(d.pos);
        })
        .style('color', function(d) {
          return pos_color(d.pos);
        });
    });
  };
}).call(this);
// token text
.token
  font-family: sans-serif
  font-size: 10pt

// sentences: a gray square introduces each one
.sentence
  padding: 2px

.sentence::before
  content: '■ '
  color: #AAA

// lemma
// The tall sentence line height leaves room under each text line for the
// lemma, which is absolutely positioned below its (relatively positioned) token.
.sentence
  line-height: 2.5em

.token
  position: relative
  top: 0
  left: 0
  line-height: 1em

.lemma
  position: absolute
  top: 18px
  left: 0
  text-align: center
  font-size: 0.8em
  width: 100%

// bl.ocks
body
  padding: 40px
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment