Created
September 18, 2014 04:20
-
-
Save rns/917559de6418516fc93a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
0-25 R : Time flies like an arrow. <- S ::= NP VP period | |
0- 4 R : Time <- NP ::= NN | |
# token | |
0- 4 T : Time <- NN | |
# glade 0- 4: Time --- | |
- (NP (NN Time)) | |
5-24 R : flies like an arrow <- VP ::= VBZ PP | |
# token | |
5-10 T : flies <- VBZ | |
11-24 R : like an arrow <- PP ::= IN NP | |
# token | |
11-15 T : like <- IN | |
16-24 R : an arrow <- NP ::= DT NN | |
# token | |
16-18 T : an <- DT | |
# token | |
19-24 T : arrow <- NN | |
# glade 16-24: an arrow --- | |
- (NP (DT an) (NN arrow)) | |
# glade 11-24: like an arrow --- | |
- (PP (IN like) (NP (DT an) (NN arrow))) | |
# glade 5-24: flies like an arrow --- | |
- (VP (VBZ flies) (PP (IN like) (NP (DT an) (NN arrow)))) | |
# token | |
24-25 T : . <- period | |
0-10 R : Time flies <- NP ::= NN NNS | |
# token | |
0- 4 T : Time <- NN | |
# token | |
5-10 T : flies <- NNS | |
# glade 0-10: Time flies --- | |
- (NP (NN Time) (NNS flies)) | |
11-24 R : like an arrow <- VP ::= VBP NP | |
# token | |
11-15 T : like <- VBP | |
# glade 11-24: like an arrow --- | |
- (VP (VBP like) (NP (DT an) (NN arrow))) | |
# glade 0-25: Time flies like an arrow. --- | |
- |- | |
(S (NP (NN Time)) | |
(VP (VBZ flies) (PP (IN like) (NP (DT an) (NN arrow)))) | |
(period .)) | |
- |- | |
(S (NP (NN Time) (NNS flies)) | |
(VP (VBP like) (NP (DT an) (NN arrow))) | |
(period .)) | |
0-26 R : Fruit flies like a banana. <- S ::= NP VP period | |
0- 5 R : Fruit <- NP ::= NN | |
# token | |
0- 5 T : Fruit <- NN | |
# glade 0- 5: Fruit --- | |
- (NP (NN Fruit)) | |
6-25 R : flies like a banana <- VP ::= VBZ PP | |
# token | |
6-11 T : flies <- VBZ | |
12-25 R : like a banana <- PP ::= IN NP | |
# token | |
12-16 T : like <- IN | |
17-25 R : a banana <- NP ::= DT NN | |
# token | |
17-18 T : a <- DT | |
# token | |
19-25 T : banana <- NN | |
# glade 17-25: a banana --- | |
- (NP (DT a) (NN banana)) | |
# glade 12-25: like a banana --- | |
- (PP (IN like) (NP (DT a) (NN banana))) | |
# glade 6-25: flies like a banana --- | |
- (VP (VBZ flies) (PP (IN like) (NP (DT a) (NN banana)))) | |
# token | |
25-26 T : . <- period | |
0-11 R : Fruit flies <- NP ::= NN NNS | |
# token | |
0- 5 T : Fruit <- NN | |
# token | |
6-11 T : flies <- NNS | |
# glade 0-11: Fruit flies --- | |
- (NP (NN Fruit) (NNS flies)) | |
12-25 R : like a banana <- VP ::= VBP NP | |
# token | |
12-16 T : like <- VBP | |
# glade 12-25: like a banana --- | |
- (VP (VBP like) (NP (DT a) (NN banana))) | |
# glade 0-26: Fruit flies like a banana. --- | |
- |- | |
(S (NP (NN Fruit)) | |
(VP (VBZ flies) (PP (IN like) (NP (DT a) (NN banana)))) | |
(period .)) | |
- |- | |
(S (NP (NN Fruit) (NNS flies)) | |
(VP (VBP like) (NP (DT a) (NN banana))) | |
(period .)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
# Copyright 2013 Jeffrey Kegler | |
# This file is part of Marpa::R2. Marpa::R2 is free software: you can | |
# redistribute it and/or modify it under the terms of the GNU Lesser | |
# General Public License as published by the Free Software Foundation, | |
# either version 3 of the License, or (at your option) any later version. | |
# | |
# Marpa::R2 is distributed in the hope that it will be useful, | |
# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
# Lesser General Public License for more details. | |
# | |
# You should have received a copy of the GNU Lesser | |
# General Public License along with Marpa::R2. If not, see | |
# http://www.gnu.org/licenses/. | |
# This example parses ambiguous English sentences. The target annotation | |
# is Penn Treebank's syntactic bracketing tags. For details, see | |
# http://www.cis.upenn.edu/~treebank/ | |
# This example originally came from Ralf Muschall. Ruslan Shvedov | |
# reworked my implementation, converting it to the SLIF and | |
# Penn Treebank. Ruslan and Ralf clearly know English grammar better than | |
# most of us native speakers. | |
# 'time', 'fruit', and 'flies' can be nouns or verbs, 'like' can be | |
# a preposition or a verb. This creates syntactic ambiguity shown | |
# in the parse results. | |
# Modifier nouns are not tagged or lexed as adjectives (JJ), because | |
# "Nouns that are used as modifiers, whether in isolation or in sequences, | |
# should be tagged as nouns (NN, NNS) rather than as adjectives (JJ)." | |
# -- ftp://ftp.cis.upenn.edu/pub/treebank/doc/tagguide.ps.gz | |
# The saying "time flies like an arrow; fruit flies like a banana" | |
# is attributed to Groucho Marx, but there is no reason to believe | |
# he ever said it. Apparently, the saying | |
# first appeared on the Usenet on net.jokes in 1982. | |
# I've documented this whole thing on Wikipedia: | |
# http://en.wikipedia.org/wiki/Time_flies_like_an_arrow | |
# | |
# The permalink is: | |
# http://en.wikipedia.org/w/index.php?title=Time_flies_like_an_arrow&oldid=311163283 | |
use 5.010; | |
use strict; | |
use warnings; | |
use Marpa::R2; | |
use YAML; | |
my $grammar = Marpa::R2::Scanless::G->new( | |
{ | |
source => \(<<'END_OF_SOURCE'), | |
:default ::= action => [ name, values ] | |
lexeme default = action => [ name, value ] | |
S ::= NP VP period | |
NP ::= NN | |
| JJ NN | |
| DT NN | |
| NN NNS | |
VP ::= VBP NP | |
| VBP PP | |
| VBZ PP | |
| VBZ RB | |
PP ::= IN NP | |
period ~ '.' | |
:discard ~ whitespace | |
whitespace ~ [\s]+ | |
DT ~ 'a' | 'an' | |
NN ~ 'arrow' | 'banana' | |
NNS ~ 'flies' | |
NNS ~ 'bananas' | |
VBZ ~ 'flies' | |
NN ~ 'fruit':i | |
VBP ~ 'fruit':i | |
IN ~ 'like' | |
VBP ~ 'like' | |
NN ~ 'time':i | |
VBP ~ 'time':i | |
RB ~ 'fast' | |
VBP ~ 'fast' | |
JJ ~ 'fast' | |
NN ~ 'fast' | |
VBP ~ 'spoil' | |
END_OF_SOURCE | |
} | |
); | |
my $expected = <<'EOS'; | |
[To be written] | |
EOS | |
my $paragraph = <<END_OF_PARAGRAPH; | |
Time flies like an arrow. | |
Time flies fast. | |
Fruit flies like a banana. | |
Fruit flies spoil banana. | |
END_OF_PARAGRAPH | |
my $recce; | |
for my $sentence (split /\n/, $paragraph){ | |
$recce = Marpa::R2::Scanless::R->new( { grammar => $grammar } ); | |
$recce->read( \$sentence ); | |
if ( $recce->ambiguity_metric() > 1 ) { | |
my $asf = Marpa::R2::ASF->new( { slr => $recce } ); | |
die 'No ASF' if not defined $asf; | |
$asf->traverse( {}, \&traverser ); | |
} | |
} | |
sub all_choices{ | |
my $glade = shift; | |
# The results at each position are a list of choices, so | |
# to produce a new result list, we need to take a Cartesian | |
# product of all the choices | |
my $length = $glade->rh_length(); | |
my @results = ( [] ); | |
for my $rh_ix ( 0 .. $length - 1 ) { | |
my @new_results = (); | |
for my $old_result (@results) { | |
my $child_value = $glade->rh_value($rh_ix); | |
for my $new_value ( @{ $child_value } ) { | |
push @new_results, [ @{$old_result}, $new_value ]; | |
} | |
} | |
@results = @new_results; | |
} ## end for my $rh_ix ( 0 .. $length - 1 ) | |
return @results; | |
} | |
sub traverser { | |
my ($glade, $scratch) = @_; | |
# get rule and symbol IDs | |
my $rule_id = $glade->rule_id(); | |
my $symbol_id = $glade->symbol_id(); | |
my $symbol_name = $grammar->symbol_name($symbol_id); | |
my $display_form = $grammar->symbol_display_form($symbol_id); | |
# get span | |
my ( $start, $length ) = $glade->span(); | |
my $literal = $glade->literal(); | |
# set glade's prefix | |
my $prefix = sprintf "%3s-%2s", $start, $start + $length; | |
# check for tokens | |
if (not defined $rule_id){ | |
say "# token\n", $prefix, " T : ", sprintf("%-30s", $literal), " <- $display_form"; | |
return ["($symbol_name $literal)"]; | |
} | |
# get rule expansion | |
unless ( $symbol_name eq '[:start]' ){ | |
my ($lhs_id, @rhs_ids) = | |
map { $grammar->symbol_display_form($_) } | |
$grammar->rule_expand($rule_id); | |
my $rule = "$lhs_id ::= " . join q{ }, @rhs_ids; | |
# rule | |
say $prefix, " R : ", sprintf("%-30s", $literal), " <- $rule"; | |
} | |
my @return_value; | |
CHOICE: while (1) { | |
# The results at each position are a list of choices, so | |
# to produce a new result list, we need to take a Cartesian | |
# product of all the choices | |
my @results = all_choices( $glade ); | |
# Special case for the start rule | |
if ( $symbol_name eq '[:start]' ) { | |
my $retval = [ map { join q{}, @{$_} } @results ]; | |
# say Dump $retval; | |
return $retval; | |
} | |
# Now we have a list of choices, as a list of lists. Each sub list | |
# is a list of tokens, which we need to join into | |
# a single token. The result will be to collapse | |
# one level of lists, and leave us with a list of Penn-tagged | |
# elements | |
my $join_ws = q{ }; | |
$join_ws = qq{\n } if $symbol_name eq 'S'; | |
my $push_val = [ map { '(' . $symbol_name . q{ } . ( join $join_ws, @{$_} ) . ')' } | |
@results ]; | |
# say "# choice$prefix ", Dump $push_val; | |
push @return_value, @$push_val; | |
# Look at the next alternative in this glade, or end the | |
# loop if there is none | |
last CHOICE if not defined $glade->next(); | |
} ## end CHOICE: while (1) | |
# Return the list of tokens for this glade | |
say "# glade $prefix: $literal ", Dump \@return_value; | |
return \@return_value; | |
} ## end sub traverser |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment