1. 安装
  2. Chart Parsing
  3. demo - chart parser
    1. tree
    2. Well-Formed Substring Tables
    3. Charts
    4. earleychart demo
    5. Complex grammar and tokens

Python用自然语言处理工具nltk(Natural Language Toolkit, http://www.nltk.org/ )完成计算语言学中的Chart Parsing。

安装

  1. 使用pip安装

    http://www.nltk.org/install.html

    1
    pip install nltk

    或下载安装包安装( http://pypi.python.org/pypi/nltk

    1
    python setup.py install

  2. 需要安装NLTK Data( http://www.nltk.org/data.html )。在python中输入以下代码:

    1
    2
    import nltk
    nltk.download()

Chart Parsing

demo - chart parser

tree

Parse a sentence John ate the cat

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import nltk

grammar = nltk.CFG.fromstring("""
S -> NP VP
VP -> V NP | V
NP -> NAME | ART N
NAME -> 'John'
V -> 'ate'
ART -> 'the'
N -> 'cat'
""")

tokens = ['John', 'ate', 'the', 'cat']

parser = nltk.ChartParser(grammar, trace=1)
for tree in parser.parse(tokens):
print(tree)
tree.draw()

parser = nltk.parse.chart.BottomUpChartParser(grammar, trace=1)
for tree in parser.parse(tokens):
print(tree)

parser = nltk.parse.earleychart.EarleyChartParser(grammar, trace=1)
for tree in parser.parse(tokens):
print(tree)

output

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
|.   John  .   ate   .   the   .   cat   .|
|[---------] . . .| [0:1] 'John'
|. [---------] . .| [1:2] 'ate'
|. . [---------] .| [2:3] 'the'
|. . . [---------]| [3:4] 'cat'
|[---------] . . .| [0:1] NAME -> 'John' *
|[---------] . . .| [0:1] NP -> NAME *
|[---------> . . .| [0:1] S -> NP * VP
|. [---------] . .| [1:2] V -> 'ate' *
|. [---------> . .| [1:2] VP -> V * NP
|. [---------] . .| [1:2] VP -> V *
|[-------------------] . .| [0:2] S -> NP VP *
|. . [---------] .| [2:3] ART -> 'the' *
|. . [---------> .| [2:3] NP -> ART * N
|. . . [---------]| [3:4] N -> 'cat' *
|. . [-------------------]| [2:4] NP -> ART N *
|. . [------------------->| [2:4] S -> NP * VP
|. [-----------------------------]| [1:4] VP -> V NP *
|[=======================================]| [0:4] S -> NP VP *
(S (NP (NAME John)) (VP (V ate) (NP (ART the) (N cat))))

|. John . ate . the . cat .|
|[---------] . . .| [0:1] 'John'
|. [---------] . .| [1:2] 'ate'
|. . [---------] .| [2:3] 'the'
|. . . [---------]| [3:4] 'cat'
|> . . . .| [0:0] NAME -> * 'John'
|[---------] . . .| [0:1] NAME -> 'John' *
|> . . . .| [0:0] NP -> * NAME
|[---------] . . .| [0:1] NP -> NAME *
|> . . . .| [0:0] S -> * NP VP
|[---------> . . .| [0:1] S -> NP * VP
|. > . . .| [1:1] V -> * 'ate'
|. [---------] . .| [1:2] V -> 'ate' *
|. > . . .| [1:1] VP -> * V NP
|. > . . .| [1:1] VP -> * V
|. [---------> . .| [1:2] VP -> V * NP
|. [---------] . .| [1:2] VP -> V *
|[-------------------] . .| [0:2] S -> NP VP *
|. . > . .| [2:2] ART -> * 'the'
|. . [---------] .| [2:3] ART -> 'the' *
|. . > . .| [2:2] NP -> * ART N
|. . [---------> .| [2:3] NP -> ART * N
|. . . > .| [3:3] N -> * 'cat'
|. . . [---------]| [3:4] N -> 'cat' *
|. . [-------------------]| [2:4] NP -> ART N *
|. . > . .| [2:2] S -> * NP VP
|. [-----------------------------]| [1:4] VP -> V NP *
|. . [------------------->| [2:4] S -> NP * VP
|[=======================================]| [0:4] S -> NP VP *
(S (NP (NAME John)) (VP (V ate) (NP (ART the) (N cat))))

|. John . ate . the . cat .|
|[---------] . . .| [0:1] 'John'
|. [---------] . .| [1:2] 'ate'
|. . [---------] .| [2:3] 'the'
|. . . [---------]| [3:4] 'cat'
|> . . . .| [0:0] S -> * NP VP
|> . . . .| [0:0] NP -> * NAME
|> . . . .| [0:0] NP -> * ART N
|> . . . .| [0:0] NAME -> * 'John'
|[---------] . . .| [0:1] NAME -> 'John' *
|[---------] . . .| [0:1] NP -> NAME *
|[---------> . . .| [0:1] S -> NP * VP
|. > . . .| [1:1] VP -> * V NP
|. > . . .| [1:1] VP -> * V
|. > . . .| [1:1] V -> * 'ate'
|. [---------] . .| [1:2] V -> 'ate' *
|. [---------> . .| [1:2] VP -> V * NP
|. [---------] . .| [1:2] VP -> V *
|[-------------------] . .| [0:2] S -> NP VP *
|. . > . .| [2:2] NP -> * NAME
|. . > . .| [2:2] NP -> * ART N
|. . > . .| [2:2] ART -> * 'the'
|. . [---------] .| [2:3] ART -> 'the' *
|. . [---------> .| [2:3] NP -> ART * N
|. . . > .| [3:3] N -> * 'cat'
|. . . [---------]| [3:4] N -> 'cat' *
|. . [-------------------]| [2:4] NP -> ART N *
|. [-----------------------------]| [1:4] VP -> V NP *
|[=======================================]| [0:4] S -> NP VP *
(S (NP (NAME John)) (VP (V ate) (NP (ART the) (N cat))))

Well-Formed Substring Tables

Parse a sentence John ate the cat

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import nltk
grammar = nltk.CFG.fromstring("""
S -> NP VP
VP -> V NP | V
NP -> NAME | ART N
NAME -> 'John'
V -> 'ate'
ART -> 'the'
N -> 'cat'
""")

tokens = ['John', 'ate', 'the', 'cat']

def init_wfst(tokens, grammar):
numtokens = len(tokens)
wfst = [['.' for i in range(numtokens+1)] for j in range(numtokens+1)]
for i in range(numtokens):
productions = grammar.productions(rhs=tokens[i])
wfst[i][i+1] = productions[0].lhs()
return wfst

def display(wfst, tokens):
print('\nWFST ' + ' '.join([("%-4d" % i) for i in range(1, len(wfst))]))
for i in range(len(wfst)-1):
print_string = ("%d " % i)
for j in range(1, len(wfst)):
print_string += (" %-4s"% wfst[i][j])
print(print_string)

wfst0 = init_wfst(tokens, grammar)
display(wfst0, tokens)

output

1
2
3
4
5
WFST 1    2    3    4
0 NAME . . .
1 . V . .
2 . . ART .
3 . . . N

Charts

Parse a sentence John ate the cat

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import nltk
grammar = nltk.CFG.fromstring("""
S -> NP VP
VP -> V NP | V
NP -> NAME | ART N
NAME -> 'John'
V -> 'ate'
ART -> 'the'
N -> 'cat'
""")

tokens = ['John', 'ate', 'the', 'cat']

def complete_wfst(wfst, tokens, trace=False):
index = {}
for prod in grammar.productions():
index[prod.rhs()] = prod.lhs()
numtokens = len(tokens)
for span in range(2, numtokens+1):
for start in range(numtokens+1-span):
end = start + span
for mid in range(start+1, end):
nt1, nt2 = wfst[start][mid], wfst[mid][end]
if (nt1,nt2) in index:
if trace:
print("[%s] %3s [%s] %3s [%s] ==> [%s] %3s [%s]" % \
(start, nt1, mid, nt2, end, start, index[(nt1,nt2)], end))
wfst[start][end] = index[(nt1,nt2)]
return wfst

wfst1 = complete_wfst(wfst0, tokens, trace=True)

output

1
2
[2] ART [3]   N [4] ==> [2]  NP [4]
[1] V [2] NP [4] ==> [1] VP [4]

earleychart demo

1
2
import nltk
nltk.parse.earleychart.demo()

Complex grammar and tokens

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
grammar = nltk.CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | 'I'
VP -> V NP | VP PP
Det -> 'an' | 'my'
N -> 'elephant' | 'pajamas'
V -> 'shot'
P -> 'in'
""")
tokens = ['I', 'shot', 'an', 'elephant', 'in', 'my', 'pajamas']

grammar = nltk.CFG.fromstring("""
S -> NP VP
NP -> ART ADJ N | ART N | ADJ N
VP -> AUX VP | V NP
ART -> 'the'
ADJ -> 'large'
N -> 'can' | 'hold' | 'water'
AUX -> 'can'
V -> 'can' | 'hold' | 'water'
""")
tokens = ['the', 'large', 'can', 'can', 'hold', 'the', 'water']