Skip to content

Instantly share code, notes, and snippets.

@syllog1sm
Last active September 19, 2024 23:54
Show Gist options
  • Save syllog1sm/10343947 to your computer and use it in GitHub Desktop.
Save syllog1sm/10343947 to your computer and use it in GitHub Desktop.
A simple Python dependency parser
"""A simple implementation of a greedy transition-based parser. Released under BSD license."""
from os import path
import os
import sys
from collections import defaultdict
import random
import time
import pickle
SHIFT = 0; RIGHT = 1; LEFT = 2;
MOVES = (SHIFT, RIGHT, LEFT)
START = ['-START-', '-START2-']
END = ['-END-', '-END2-']
class DefaultList(list):
"""A list that returns a default value if index out of bounds."""
def __init__(self, default=None):
self.default = default
list.__init__(self)
def __getitem__(self, index):
try:
return list.__getitem__(self, index)
except IndexError:
return self.default
class Parse(object):
def __init__(self, n):
self.n = n
self.heads = [None] * (n-1)
self.labels = [None] * (n-1)
self.lefts = []
self.rights = []
for i in range(n+1):
self.lefts.append(DefaultList(0))
self.rights.append(DefaultList(0))
def add(self, head, child, label=None):
self.heads[child] = head
self.labels[child] = label
if child < head:
self.lefts[head].append(child)
else:
self.rights[head].append(child)
class Parser(object):
def __init__(self, load=True):
model_dir = os.path.dirname(__file__)
self.model = Perceptron(MOVES)
if load:
self.model.load(path.join(model_dir, 'parser.pickle'))
self.tagger = PerceptronTagger(load=load)
self.confusion_matrix = defaultdict(lambda: defaultdict(int))
def save(self):
self.model.save(path.join(os.path.dirname(__file__), 'parser.pickle'))
self.tagger.save()
def parse(self, words):
n = len(words)
i = 2; stack = [1]; parse = Parse(n)
tags = self.tagger.tag(words)
while stack or (i+1) < n:
features = extract_features(words, tags, i, n, stack, parse)
scores = self.model.score(features)
valid_moves = get_valid_moves(i, n, len(stack))
guess = max(valid_moves, key=lambda move: scores[move])
i = transition(guess, i, stack, parse)
return tags, parse.heads
def train_one(self, itn, words, gold_tags, gold_heads):
n = len(words)
i = 2; stack = [1]; parse = Parse(n)
tags = self.tagger.tag(words)
while stack or (i + 1) < n:
features = extract_features(words, tags, i, n, stack, parse)
scores = self.model.score(features)
valid_moves = get_valid_moves(i, n, len(stack))
gold_moves = get_gold_moves(i, n, stack, parse.heads, gold_heads)
guess = max(valid_moves, key=lambda move: scores[move])
assert gold_moves
best = max(gold_moves, key=lambda move: scores[move])
self.model.update(best, guess, features)
i = transition(guess, i, stack, parse)
self.confusion_matrix[best][guess] += 1
return len([i for i in range(n-1) if parse.heads[i] == gold_heads[i]])
def transition(move, i, stack, parse):
if move == SHIFT:
stack.append(i)
return i + 1
elif move == RIGHT:
parse.add(stack[-2], stack.pop())
return i
elif move == LEFT:
parse.add(i, stack.pop())
return i
assert move in MOVES
def get_valid_moves(i, n, stack_depth):
moves = []
if (i+1) < n:
moves.append(SHIFT)
if stack_depth >= 2:
moves.append(RIGHT)
if stack_depth >= 1:
moves.append(LEFT)
return moves
def get_gold_moves(n0, n, stack, heads, gold):
def deps_between(target, others, gold):
for word in others:
if gold[word] == target or gold[target] == word:
return True
return False
valid = get_valid_moves(n0, n, len(stack))
if not stack or (SHIFT in valid and gold[n0] == stack[-1]):
return [SHIFT]
if gold[stack[-1]] == n0:
return [LEFT]
costly = set([m for m in MOVES if m not in valid])
# If the word behind s0 is its gold head, Left is incorrect
if len(stack) >= 2 and gold[stack[-1]] == stack[-2]:
costly.add(LEFT)
# If there are any dependencies between n0 and the stack,
# pushing n0 will lose them.
if SHIFT not in costly and deps_between(n0, stack, gold):
costly.add(SHIFT)
# If there are any dependencies between s0 and the buffer, popping
# s0 will lose them.
if deps_between(stack[-1], range(n0+1, n-1), gold):
costly.add(LEFT)
costly.add(RIGHT)
return [m for m in MOVES if m not in costly]
def extract_features(words, tags, n0, n, stack, parse):
def get_stack_context(depth, stack, data):
if depth >= 3:
return data[stack[-1]], data[stack[-2]], data[stack[-3]]
elif depth >= 2:
return data[stack[-1]], data[stack[-2]], ''
elif depth == 1:
return data[stack[-1]], '', ''
else:
return '', '', ''
def get_buffer_context(i, n, data):
if i + 1 >= n:
return data[i], '', ''
elif i + 2 >= n:
return data[i], data[i + 1], ''
else:
return data[i], data[i + 1], data[i + 2]
def get_parse_context(word, deps, data):
if word == -1:
return 0, '', ''
deps = deps[word]
valency = len(deps)
if not valency:
return 0, '', ''
elif valency == 1:
return 1, data[deps[-1]], ''
else:
return valency, data[deps[-1]], data[deps[-2]]
features = {}
# Set up the context pieces --- the word (W) and tag (T) of:
# S0-2: Top three words on the stack
# N0-2: First three words of the buffer
# n0b1, n0b2: Two leftmost children of the first word of the buffer
# s0b1, s0b2: Two leftmost children of the top word of the stack
# s0f1, s0f2: Two rightmost children of the top word of the stack
depth = len(stack)
s0 = stack[-1] if depth else -1
Ws0, Ws1, Ws2 = get_stack_context(depth, stack, words)
Ts0, Ts1, Ts2 = get_stack_context(depth, stack, tags)
Wn0, Wn1, Wn2 = get_buffer_context(n0, n, words)
Tn0, Tn1, Tn2 = get_buffer_context(n0, n, tags)
Vn0b, Wn0b1, Wn0b2 = get_parse_context(n0, parse.lefts, words)
Vn0b, Tn0b1, Tn0b2 = get_parse_context(n0, parse.lefts, tags)
Vn0f, Wn0f1, Wn0f2 = get_parse_context(n0, parse.rights, words)
_, Tn0f1, Tn0f2 = get_parse_context(n0, parse.rights, tags)
Vs0b, Ws0b1, Ws0b2 = get_parse_context(s0, parse.lefts, words)
_, Ts0b1, Ts0b2 = get_parse_context(s0, parse.lefts, tags)
Vs0f, Ws0f1, Ws0f2 = get_parse_context(s0, parse.rights, words)
_, Ts0f1, Ts0f2 = get_parse_context(s0, parse.rights, tags)
# Cap numeric features at 5?
# String-distance
Ds0n0 = min((n0 - s0, 5)) if s0 != 0 else 0
features['bias'] = 1
# Add word and tag unigrams
for w in (Wn0, Wn1, Wn2, Ws0, Ws1, Ws2, Wn0b1, Wn0b2, Ws0b1, Ws0b2, Ws0f1, Ws0f2):
if w:
features['w=%s' % w] = 1
for t in (Tn0, Tn1, Tn2, Ts0, Ts1, Ts2, Tn0b1, Tn0b2, Ts0b1, Ts0b2, Ts0f1, Ts0f2):
if t:
features['t=%s' % t] = 1
# Add word/tag pairs
for i, (w, t) in enumerate(((Wn0, Tn0), (Wn1, Tn1), (Wn2, Tn2), (Ws0, Ts0))):
if w or t:
features['%d w=%s, t=%s' % (i, w, t)] = 1
# Add some bigrams
features['s0w=%s, n0w=%s' % (Ws0, Wn0)] = 1
features['wn0tn0-ws0 %s/%s %s' % (Wn0, Tn0, Ws0)] = 1
features['wn0tn0-ts0 %s/%s %s' % (Wn0, Tn0, Ts0)] = 1
features['ws0ts0-wn0 %s/%s %s' % (Ws0, Ts0, Wn0)] = 1
features['ws0-ts0 tn0 %s/%s %s' % (Ws0, Ts0, Tn0)] = 1
features['wt-wt %s/%s %s/%s' % (Ws0, Ts0, Wn0, Tn0)] = 1
features['tt s0=%s n0=%s' % (Ts0, Tn0)] = 1
features['tt n0=%s n1=%s' % (Tn0, Tn1)] = 1
# Add some tag trigrams
trigrams = ((Tn0, Tn1, Tn2), (Ts0, Tn0, Tn1), (Ts0, Ts1, Tn0),
(Ts0, Ts0f1, Tn0), (Ts0, Ts0f1, Tn0), (Ts0, Tn0, Tn0b1),
(Ts0, Ts0b1, Ts0b2), (Ts0, Ts0f1, Ts0f2), (Tn0, Tn0b1, Tn0b2),
(Ts0, Ts1, Ts1))
for i, (t1, t2, t3) in enumerate(trigrams):
if t1 or t2 or t3:
features['ttt-%d %s %s %s' % (i, t1, t2, t3)] = 1
# Add some valency and distance features
vw = ((Ws0, Vs0f), (Ws0, Vs0b), (Wn0, Vn0b))
vt = ((Ts0, Vs0f), (Ts0, Vs0b), (Tn0, Vn0b))
d = ((Ws0, Ds0n0), (Wn0, Ds0n0), (Ts0, Ds0n0), (Tn0, Ds0n0),
('t' + Tn0+Ts0, Ds0n0), ('w' + Wn0+Ws0, Ds0n0))
for i, (w_t, v_d) in enumerate(vw + vt + d):
if w_t or v_d:
features['val/d-%d %s %d' % (i, w_t, v_d)] = 1
return features
class Perceptron(object):
def __init__(self, classes=None):
# Each feature gets its own weight vector, so weights is a dict-of-arrays
self.classes = classes
self.weights = {}
# The accumulated values, for the averaging. These will be keyed by
# feature/clas tuples
self._totals = defaultdict(int)
# The last time the feature was changed, for the averaging. Also
# keyed by feature/clas tuples
# (tstamps is short for timestamps)
self._tstamps = defaultdict(int)
# Number of instances seen
self.i = 0
def predict(self, features):
'''Dot-product the features and current weights and return the best class.'''
scores = self.score(features)
# Do a secondary alphabetic sort, for stability
return max(self.classes, key=lambda clas: (scores[clas], clas))
def score(self, features):
all_weights = self.weights
scores = dict((clas, 0) for clas in self.classes)
for feat, value in features.items():
if value == 0:
continue
if feat not in all_weights:
continue
weights = all_weights[feat]
for clas, weight in weights.items():
scores[clas] += value * weight
return scores
def update(self, truth, guess, features):
def upd_feat(c, f, w, v):
param = (f, c)
self._totals[param] += (self.i - self._tstamps[param]) * w
self._tstamps[param] = self.i
self.weights[f][c] = w + v
self.i += 1
if truth == guess:
return None
for f in features:
weights = self.weights.setdefault(f, {})
upd_feat(truth, f, weights.get(truth, 0.0), 1.0)
upd_feat(guess, f, weights.get(guess, 0.0), -1.0)
def average_weights(self):
for feat, weights in self.weights.items():
new_feat_weights = {}
for clas, weight in weights.items():
param = (feat, clas)
total = self._totals[param]
total += (self.i - self._tstamps[param]) * weight
averaged = round(total / float(self.i), 3)
if averaged:
new_feat_weights[clas] = averaged
self.weights[feat] = new_feat_weights
def save(self, path):
print "Saving model to %s" % path
pickle.dump(self.weights, open(path, 'w'))
def load(self, path):
self.weights = pickle.load(open(path))
class PerceptronTagger(object):
'''Greedy Averaged Perceptron tagger'''
model_loc = os.path.join(os.path.dirname(__file__), 'tagger.pickle')
def __init__(self, classes=None, load=True):
self.tagdict = {}
if classes:
self.classes = classes
else:
self.classes = set()
self.model = Perceptron(self.classes)
if load:
self.load(PerceptronTagger.model_loc)
def tag(self, words, tokenize=True):
prev, prev2 = START
tags = DefaultList('')
context = START + [self._normalize(w) for w in words] + END
for i, word in enumerate(words):
tag = self.tagdict.get(word)
if not tag:
features = self._get_features(i, word, context, prev, prev2)
tag = self.model.predict(features)
tags.append(tag)
prev2 = prev; prev = tag
return tags
def start_training(self, sentences):
self._make_tagdict(sentences)
self.model = Perceptron(self.classes)
def train(self, sentences, save_loc=None, nr_iter=5):
'''Train a model from sentences, and save it at save_loc. nr_iter
controls the number of Perceptron training iterations.'''
self.start_training(sentences)
for iter_ in range(nr_iter):
for words, tags in sentences:
self.train_one(words, tags)
random.shuffle(sentences)
self.end_training(save_loc)
def save(self):
# Pickle as a binary file
pickle.dump((self.model.weights, self.tagdict, self.classes),
open(PerceptronTagger.model_loc, 'wb'), -1)
def train_one(self, words, tags):
prev, prev2 = START
context = START + [self._normalize(w) for w in words] + END
for i, word in enumerate(words):
guess = self.tagdict.get(word)
if not guess:
feats = self._get_features(i, word, context, prev, prev2)
guess = self.model.predict(feats)
self.model.update(tags[i], guess, feats)
prev2 = prev; prev = guess
def load(self, loc):
w_td_c = pickle.load(open(loc, 'rb'))
self.model.weights, self.tagdict, self.classes = w_td_c
self.model.classes = self.classes
def _normalize(self, word):
if '-' in word and word[0] != '-':
return '!HYPHEN'
elif word.isdigit() and len(word) == 4:
return '!YEAR'
elif word[0].isdigit():
return '!DIGITS'
else:
return word.lower()
def _get_features(self, i, word, context, prev, prev2):
'''Map tokens into a feature representation, implemented as a
{hashable: float} dict. If the features change, a new model must be
trained.'''
def add(name, *args):
features[' '.join((name,) + tuple(args))] += 1
i += len(START)
features = defaultdict(int)
# It's useful to have a constant feature, which acts sort of like a prior
add('bias')
add('i suffix', word[-3:])
add('i pref1', word[0])
add('i-1 tag', prev)
add('i-2 tag', prev2)
add('i tag+i-2 tag', prev, prev2)
add('i word', context[i])
add('i-1 tag+i word', prev, context[i])
add('i-1 word', context[i-1])
add('i-1 suffix', context[i-1][-3:])
add('i-2 word', context[i-2])
add('i+1 word', context[i+1])
add('i+1 suffix', context[i+1][-3:])
add('i+2 word', context[i+2])
return features
def _make_tagdict(self, sentences):
'''Make a tag dictionary for single-tag words.'''
counts = defaultdict(lambda: defaultdict(int))
for sent in sentences:
for word, tag in zip(sent[0], sent[1]):
counts[word][tag] += 1
self.classes.add(tag)
freq_thresh = 20
ambiguity_thresh = 0.97
for word, tag_freqs in counts.items():
tag, mode = max(tag_freqs.items(), key=lambda item: item[1])
n = sum(tag_freqs.values())
# Don't add rare words to the tag dictionary
# Only add quite unambiguous words
if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh:
self.tagdict[word] = tag
def _pc(n, d):
return (float(n) / d) * 100
def train(parser, sentences, nr_iter):
parser.tagger.start_training(sentences)
for itn in range(nr_iter):
corr = 0; total = 0
random.shuffle(sentences)
for words, gold_tags, gold_parse, gold_label in sentences:
corr += parser.train_one(itn, words, gold_tags, gold_parse)
if itn < 5:
parser.tagger.train_one(words, gold_tags)
total += len(words)
print itn, '%.3f' % (float(corr) / float(total))
if itn == 4:
parser.tagger.model.average_weights()
print 'Averaging weights'
parser.model.average_weights()
def read_pos(loc):
for line in open(loc):
if not line.strip():
continue
words = DefaultList('')
tags = DefaultList('')
for token in line.split():
if not token:
continue
word, tag = token.rsplit('/', 1)
#words.append(normalize(word))
words.append(word)
tags.append(tag)
pad_tokens(words); pad_tokens(tags)
yield words, tags
def read_conll(loc):
for sent_str in open(loc).read().strip().split('\n\n'):
lines = [line.split() for line in sent_str.split('\n')]
words = DefaultList(''); tags = DefaultList('')
heads = [None]; labels = [None]
for i, (word, pos, head, label) in enumerate(lines):
words.append(intern(word))
#words.append(intern(normalize(word)))
tags.append(intern(pos))
heads.append(int(head) + 1 if head != '-1' else len(lines) + 1)
labels.append(label)
pad_tokens(words); pad_tokens(tags)
yield words, tags, heads, labels
def pad_tokens(tokens):
tokens.insert(0, '<start>')
tokens.append('ROOT')
def main(model_dir, train_loc, heldout_in, heldout_gold):
if not os.path.exists(model_dir):
os.mkdir(model_dir)
input_sents = list(read_pos(heldout_in))
parser = Parser(load=False)
sentences = list(read_conll(train_loc))
train(parser, sentences, nr_iter=15)
parser.save()
c = 0
t = 0
gold_sents = list(read_conll(heldout_gold))
t1 = time.time()
for (words, tags), (_, _, gold_heads, gold_labels) in zip(input_sents, gold_sents):
_, heads = parser.parse(words)
for i, w in list(enumerate(words))[1:-1]:
if gold_labels[i] in ('P', 'punct'):
continue
if heads[i] == gold_heads[i]:
c += 1
t += 1
t2 = time.time()
print 'Parsing took %0.3f ms' % ((t2-t1)*1000.0)
print c, t, float(c)/t
if __name__ == '__main__':
main(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])
@Harryhaha
Copy link

Hi, thanks for your code, I am quite interested in it. Could you please clarify what the 4 command line parameters should be ? Thank you.

@ShrishtiAK
Copy link

Where can I get parser.py file (parsing english in 500 lines of python)?

@newoaixuoy
Copy link

In line 359, where is end_training() defined?

@lluisp
Copy link

lluisp commented Sep 18, 2017

I liked you post a lot, it is very clear and enlightening.

I have a question, though: I tried to train it with conll-X data (after guessing the parameters, their formats, and fixing the files... some doc here would be nice) but it always fails at "assert gold_moves" in line 84. The reason is that since the perceptron starts with random weights, during the first epoch it is very likely to get into a nonsense path and reach a configuration were all moves will be discarded.
Any suggestion how to handle this? Maybe use a static oracle during the first epochs?

@arjunmenon
Copy link

Hey @lluisp

(after guessing the parameters, their formats, and fixing the files

Can you share those parameters and the doc formats?

@hailiang-wang
Copy link

I am trying to make it work with https://github.com/UniversalDependencies/UD_English-EWT
But I came to some errors.
Check out my codes.
https://gist.github.com/Samurais/0dc04f265799731caebf80d9ebde6395

@devkatcybersecurity
Copy link

@Samurais did you figure out your mistake ?

@cadddr
Copy link

cadddr commented Jul 26, 2018

@lluisp got away with the following for line 84:
try: assert gold_moves
except: gold_moves = valid_moves
Though, haven't yet read about dynamic oracle so not sure if this ruins the training.

Edit: don't think this is a good idea. For now, simply skip iterations where this error happens. Thankfully, this error occurs only on a smaller subset.

@cadddr
Copy link

cadddr commented Jul 26, 2018

@arjunmenon it's
python parser.py model_dir wsj_train.dep wsj_train.pos wsj_test.dep

  • model_dir gets created if doesn't already exist

  • wsj.dep obtained by

  1. doing the following on the .mrg trebank files (see original blogpost: https://explosion.ai/blog/parsing-english-in-python)
for f in $1/*.mrg; do
    echo $f
    grep -v CODE $f > "$f.2"
    out="$f.dep"
    java -mx800m -cp "$scriptdir/*:" edu.stanford.nlp.trees.EnglishGrammaticalStructure \
-treeFile "$f.2" -basic -makeCopulaHead -conllx > $out
done
  1. merging them together in one file e.g., via cat combined/*dep > wsj.dep
  2. since newer stanford's output changed, you need to throw away some of the conll fields, changing line 477 to:
    for i, (_, word, _, pos, _, _, head, label, _, _) in enumerate(lines):
  • finally, .pos file merged in similar fashion, in addition needs brackers and "====" lines removed.

@cadddr
Copy link

cadddr commented Jul 27, 2018

@Samurais @Devendra491

Great find!
I managed to run the parser on UD data by doing the conversion to conll-x format first using the following script:
perl conllu_to_conllx.pl < file.conllu > file.conll
from https://github.com/UniversalDependencies/tools

@fancyerii
Copy link

where can I get some training samples to run the codes? I don't need large amount of data, But only enough for running it.

@fancyerii
Copy link

is your algorithm arc-eager or arc-standard?

@kennycoding
Copy link

Finally I get it working, to some extend. A few bumps I have get through (in addtion to @cadddr's comments)

  1. line 84, assert fail on some sentences, just ignore
            # assert gold_moves
            if not gold_moves:
                break

  1. line 481, there is off-by-one error, as the gold heads index starts from 1 rather than 0
            # heads.append(int(head) + 1 if head != '-1' else len(lines) + 1)
            heads.append(int(head) if head != '-1' else len(lines))
  1. pad tokens before calling the parser
    parser = Parser()
    tokens = sentence.split()
    pad_tokens(tokens)
    tags, heads = parser.parse(tokens)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment