Patrick O. Perry, NYU Stern School of Business
We will use the coreNLP
package, which requires the latest version of Java.
To get this working on Mac OS, follow these steps:
1. Install the latest Java SE Development Kit.
2. Re-configure R's Java settings by running the following command from Terminal.app:

   R CMD javareconf

3. In R, install or re-install rJava from source:

   install.packages('rJava',,'http://www.rforge.net/')

4. In a terminal, run

   sudo ln -f -s $(/usr/libexec/java_home)/jre/lib/server/libjvm.dylib /usr/lib
These instructions were culled from stackoverflow.com/a/32544358 and stackoverflow.com/a/31039105.
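To confirm that the setup worked, you can check that rJava can start a JVM from R; this is just a sanity check, and the version string will vary with your installation:

library("rJava")
.jinit()
# query the running JVM for its version
.jcall("java.lang.System", "S", "getProperty", "java.version")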
We will use the following R packages.
library("dplyr")
library("jsonlite")
library("coreNLP")
library("Matrix")
library("NLP")
library("openNLP")
library("tm")
To ensure consistent runs, we set the seed before performing any analysis.
set.seed(0)
We will be using the Manually Annotated Sub-Corpus (MASC) from the American National Corpus:
masc <- jsonlite::stream_in(file("anc-masc.json"), verbose=FALSE) # raw text
sent <- jsonlite::stream_in(file("anc-masc-s.json"), verbose=FALSE) # sentence boundaries
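The remainder of the analysis relies on masc having text, text_id, mode, and class columns, and on sent recording the character offsets of the gold-standard sentence boundaries (we use its end column below). For a quick look at the structure:

str(masc)
str(sent)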
Next, we form unigram and bigram counts for the corpus, using a custom tokenizer for the bigrams.
BigramTokenizer <- function(x) {
unlist(lapply(NLP::ngrams(NLP::words(x), 2), paste, collapse = " "),
use.names = FALSE)
}
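To see what this produces, here is the same NLP::ngrams call applied to a toy word sequence (the tokenizer itself receives the words of each document from tm):

unlist(lapply(NLP::ngrams(c("the", "quick", "brown", "fox"), 2),
              paste, collapse = " "))
# "the quick" "quick brown" "brown fox"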
corpus <- VCorpus(VectorSource(masc$text))
control <- list(tolower = TRUE, removePunctuation = TRUE,
                removeNumbers = TRUE, wordLengths=c(1, Inf))

# unigram counts, converted to a documents-by-terms sparse matrix
dtm <- DocumentTermMatrix(corpus, control=control)
unigram <- sparseMatrix(dtm$i, dtm$j, x = dtm$v, dim=dim(dtm),
                        dimnames=dimnames(dtm))

# bigram counts, using the tokenizer defined above
dtm <- DocumentTermMatrix(corpus,
                          control=c(control, tokenize = BigramTokenizer))
bigram <- sparseMatrix(dtm$i, dtm$j, x = dtm$v, dim=dim(dtm),
                       dimnames=dimnames(dtm))
(n12 <- sum(bigram[,"new york"]))
[1] 124
(n1 <- sum(unigram[,"new"]))
[1] 794
(n2 <- sum(unigram[,"york"]))
[1] 149
(n <- sum(unigram))
[1] 477813
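The code below carries out a binomial likelihood-ratio test (essentially Dunning's collocation test). Occurrences of "york" after "new" are modeled as Binomial(n1, p1) draws, and occurrences of "york" after other words as Binomial(n - n1, p2) draws; the null hypothesis is p1 = p2. The test statistic is the difference in deviances,

$$
G^2 = \mathrm{dev}_0 - \mathrm{dev}_1
    = -2 \log \frac{B(n_{12};\, n_1,\, \hat{p})\; B(n_2 - n_{12};\, n - n_1,\, \hat{p})}
                   {B(n_{12};\, n_1,\, \hat{p}_1)\; B(n_2 - n_{12};\, n - n_1,\, \hat{p}_2)},
$$

where $B(k;\, m,\, q)$ denotes the binomial probability mass function and $\hat{p} = n_2/n$, $\hat{p}_1 = n_{12}/n_1$, $\hat{p}_2 = (n_2 - n_{12})/(n - n_1)$ are the maximum-likelihood estimates under the two hypotheses. Under the null, $G^2$ is asymptotically $\chi^2_1$; since the alternative of interest is one-sided ($p_1 > p_2$), we set the statistic to zero when $\hat{p}_1 \le \hat{p}_2$.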
# null hypothesis: P(york|new) = P(york|-new)
p = n2 / n
dev0 <- -2 * (dbinom(n12, n1, p, log=TRUE)
+ dbinom(n2 - n12, n - n1, p, log=TRUE))
# alternative hypothesis: P(york|new) > P(york|-new)
p1 <- n12/n1
p2 <- (n2 - n12)/(n - n1)
if (p1 <= p2) {
dev1 <- dev0
} else {
dev1 <- -2 * (dbinom(n12, n1, p1, log=TRUE)
+ dbinom(n2 - n12, n - n1, p2, log=TRUE))
}
(chisq <- dev0 - dev1)
[1] 1472.869
(pval <- pchisq(chisq, df=1, lower.tail = FALSE))
[1] 3.063207e-322
ug <- colnames(unigram)
bg <- colnames(bigram)
n12 <- colSums(bigram)
# split each bigram into its component words (NA, NA for malformed entries)
words <- strsplit(bg, " ")
words <- lapply(words, function(w) if (length(w) == 2) w else c(NA, NA))
w <- matrix(match(unlist(words), ug), ncol=2, byrow=TRUE)
ok <- !is.na(w[,1]) & !is.na(w[,2])
n1 <- rep(NA, length(bg))
n2 <- rep(NA, length(bg))
n1[ok] <- colSums(unigram)[w[ok,1]]
n2[ok] <- colSums(unigram)[w[ok,2]]
n <- sum(unigram)
colloc <- data_frame(bigram=bg, n1, n2, n12)
colloc$chisq <- with(colloc, {
# null hypothesis: P(w2|w1) = P(w2|-w1)
p = n2 / n
dev0 <- -2 * (dbinom(n12, n1, p, log=TRUE)
+ dbinom(n2 - n12, n - n1, p, log=TRUE))
# alt hypothesis: P(w2|w1) > P(w2|-w1)
p1 <- n12/n1
p2 <- (n2 - n12)/(n - n1)
dev1 <- -2 * (dbinom(n12, n1, p1, log=TRUE)
+ dbinom(n2 - n12, n - n1, p2, log=TRUE))
ifelse(p1 <= p2, 0, dev0 - dev1)
})
colloc$pval <- pchisq(colloc$chisq, df=1, lower.tail = FALSE)
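As a quick consistency check, the vectorized computation should reproduce the by-hand "new york" result above (a chisq of about 1472.87):

colloc %>% filter(bigram == "new york")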
print(n=100, colloc %>% arrange(desc(chisq)))
Source: local data frame [236,292 x 6]
bigram n1 n2 n12 chisq
1 of the 12003 26107 2804 4483.7424
2 in the 8358 26107 2173 3874.0079
3 jack sparrow 464 231 222 3132.8576
4 united states 221 275 179 2611.6980
5 i think 6664 729 415 2583.9190
6 to be 13510 2521 754 2438.7978
7 tz id 150 295 150 2310.9826
8 hong kong 172 127 127 2147.3793
9 going to 525 13510 376 2073.7134
10 if you 1406 4777 381 1916.3666
11 i dont 6664 717 337 1915.5133
12 will be 1651 2521 321 1795.3247
13 i am 6664 407 264 1742.4935
14 have been 2349 762 243 1659.6422
15 it was 4059 3042 410 1587.5769
16 it is 4059 5352 497 1567.0414
17 on the 3408 26107 887 1552.6807
18 you know 4777 895 287 1549.4260
19 has been 964 762 190 1544.3022
20 elizabeth swann 163 96 93 1525.8272
21 new york 794 149 124 1472.8689
22 will turner 1651 149 135 1448.8616
23 more than 1000 664 173 1406.2400
24 the same 26107 300 269 1370.6457
25 would be 1213 2521 235 1304.2399
26 las vegas 69 68 64 1203.2063
27 this is 2781 5352 363 1182.1943
28 want to 385 13510 233 1157.6046
29 we have 2475 2349 257 1130.1066
30 do you 1309 4777 257 1104.5186
31 postal service 140 257 82 1074.1557
32 able to 151 13510 149 1043.0517
33 a lot 10456 168 150 1035.1055
34 i was 6664 3042 373 1015.3806
35 davy jones 58 117 58 999.7227
36 there are 1115 2316 185 993.2465
37 dont know 717 895 131 985.5995
38 the first 26107 582 302 984.2828
39 vice president 87 373 73 983.2494
40 i mean 6664 257 153 966.8531
41 you can 4777 1172 226 961.2615
42 to make 13510 534 234 957.5691
43 at least 2229 108 95 944.6468
44 jeffery clark 68 79 55 944.2783
45 let me 222 1303 103 918.6760
46 thank you 142 4777 114 912.2991
47 one of 1454 12003 323 904.4365
48 there is 1115 5352 217 878.6147
49 contenttype textplain 46 46 44 872.8536
50 at the 2229 26107 536 854.4836
51 ian malcolm 58 65 48 851.7200
52 alan grant 74 126 55 850.7177
53 does not 297 2263 115 842.1561
54 as a 2885 10456 395 836.4532
55 to get 13510 678 234 824.0167
56 part of 238 12003 153 823.2677
57 gen horatio 42 40 40 814.9630
58 can be 1172 2521 167 813.1067
59 rather than 121 664 73 806.3949
60 a few 10456 176 131 804.8453
61 charsetusascii contenttransferencoding 39 43 39 785.6280
62 out of 1037 12003 258 781.7230
63 textplain charsetusascii 46 39 39 773.0086
64 horatio sanchez 40 55 40 766.5895
65 kind of 154 12003 124 764.5321
66 sonoma county 66 74 46 763.7113
67 we can 2475 1172 159 763.2663
68 for the 4247 26107 734 762.4947
69 the united 26107 221 170 756.3593
70 to do 13510 1309 290 746.9904
71 did not 514 2263 120 735.9569
72 health care 154 145 56 730.4872
73 i have 6664 2349 277 730.1319
74 they were 1508 1420 141 725.9595
75 messageid javamailevansthyme 38 35 35 715.5208
76 such as 346 2885 111 707.1761
77 should be 405 2521 112 705.1471
78 need to 404 13510 174 703.8359
79 he was 1984 3042 190 702.5083
80 be able 2521 151 86 699.2811
81 make sure 534 273 74 697.9953
82 from the 2196 26107 484 695.3911
83 et al 50 94 42 695.2869
84 may be 527 2521 119 694.9955
85 hong kongs 172 43 43 693.7095
86 is a 5352 10456 488 691.8020
87 flying dutchman 59 38 36 660.9524
88 cutler beckett 35 57 35 660.4837
89 lord cutler 58 35 35 658.6070
90 contenttransferencoding bit 43 121 40 655.7139
91 number of 195 12003 123 654.3068
92 but i 2018 6664 244 653.7842
93 as well 2885 750 130 649.9114
94 do not 1309 2263 142 640.7530
95 subject re 287 69 48 635.8111
96 look at 287 2229 92 633.1685
97 he said 1984 931 122 628.9446
98 there was 1115 3042 145 623.4056
99 president bush 373 86 51 620.9797
100 trying to 125 13510 103 620.2518
.. ... ... ... ... ...
Variables not shown: pval (dbl)
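With a p-value for every bigram, a natural next step is to keep only the collocations that survive a multiple-testing correction. A minimal sketch, using a (conservative) Bonferroni cutoff at an arbitrary 0.01 level:

colloc %>%
    filter(!is.na(pval), pval < 0.01 / n()) %>%
    arrange(desc(chisq))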
The Stanford CoreNLP library uses a hand-written sentence splitter with special cases for abbreviations like “Mr.” and “Ph.D.”
# initialize the coreNLP library; this fails unless you've already run downloadCoreNLP
coreNLP::initCoreNLP(annotators=c("tokenize", "ssplit"))
# annotate each text
s_core <-
do(masc %>% group_by(text_id), {
anno <- coreNLP::annotateString(.$text)
# extract the token boundaries
tok <- coreNLP::getToken(anno)
# determine the sentence boundaries
(tok %>% group_by(sentence)
%>% summarize(start = min(CharacterOffsetBegin),
end = max(CharacterOffsetEnd)))
}) %>% ungroup()
Next, we split sentences with the Apache OpenNLP maximum-entropy sentence detector, via the openNLP package:
ator <- openNLP::Maxent_Sent_Token_Annotator(language="en")
s_open <-
do(masc %>% group_by(text_id), {
s <- NLP::as.String(.$text)
spans <- NLP::annotate(s, ator)
s_open <- as.data.frame(spans)
}) %>% ungroup()
The Punkt sentence splitter is available in Python, but not in R. Here is code using NLTK 3.0 (the pre-trained English Punkt model must first be installed, e.g. with nltk.download('punkt')):
import json
import nltk.data
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
infile = open('anc-masc.json', 'r')
outfile = open('anc-masc-punkt.json', 'w')
for line in infile:
obj = json.loads(line)
spans = sent_detector.span_tokenize(obj['text'])
for i,s in enumerate(spans):
json.dump({'text_id': obj['text_id'],
'sentence': i,
'begin': s[0],
'end': s[1]}, outfile)
outfile.write('\n')
outfile.close()
infile.close()
We can read the results from Punkt into R.
punkt <- jsonlite::stream_in(file("anc-masc-punkt.json"), verbose=FALSE)
# punkt gives [begin,end) spans in 0-based indices, and does not include
# trailing punctuation. Converting to R's 1-based, end-inclusive [begin,end]
# convention only requires shifting begin: a 0-based exclusive end coincides
# with a 1-based inclusive end.
s_punkt <- punkt %>% mutate(begin = begin + 1)
loss <- function(truth, est) {
tp <- length(intersect(truth, est)) # true positives
fp <- length(est) - tp # false positives
fn <- length(truth) - tp # false negatives
tn <- max(truth) - (tp + fp + fn) # true negatives
precision <- tp / (tp + fp)
recall <- tp / (tp + fn)
data_frame(precision, recall)
}
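As a sanity check on made-up boundaries: if the true sentence ends are at offsets 10, 20, 30 and the estimated ends are 10, 25, 30, then two estimates are true positives, one is a false positive, and one true boundary is missed, so precision and recall are both 2/3:

loss(c(10, 20, 30), c(10, 25, 30))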
results <-
do(sent %>% group_by(text_id), {
tid <- .$text_id[[1]]
truth <- .$end
est_core <- (s_core %>% filter(text_id == tid))$end
est_open <- (s_open %>% filter(text_id == tid))$end
est_punkt <- (s_punkt %>% filter(text_id == tid))$end
l_core <- loss(truth, est_core)
l_open <- loss(truth, est_open)
l_punkt <- loss(truth, est_punkt)
data_frame(text_id = tid,
precision_core = l_core$precision, recall_core = l_core$recall,
precision_open = l_open$precision, recall_open = l_open$recall,
precision_punkt = l_punkt$precision, recall_punkt = l_punkt$recall)
}) %>% ungroup()
(results %>% left_join(masc, by="text_id")
%>% group_by(mode, class)
%>% summarize(precision_core = median(precision_core),
precision_open = median(precision_open),
precision_punkt = median(precision_punkt),
recall_core = median(recall_core),
recall_open = median(recall_open),
recall_punkt = median(recall_punkt)))
Source: local data frame [19 x 8]
Groups: mode
mode class precision_core precision_open precision_punkt
1 spoken court-transcript 0.9888920 0.9792127 0.9869341
2 spoken debate-transcript 0.9902222 0.9905660 0.9875362
3 spoken face-to-face 1.0000000 1.0000000 0.9846154
4 spoken telephone 1.0000000 1.0000000 0.0000000
5 written blog 0.9259259 0.9489796 0.9250000
6 written email 0.9000000 0.6666667 0.8153409
7 written essays 0.7871300 0.7993529 0.7754242
8 written ficlets 0.9516129 0.9627451 0.9546436
9 written fiction 0.9955357 0.9981343 0.9911111
10 written govt-docs 0.9601329 0.9701493 0.9532374
11 written jokes 0.8746439 0.9222670 0.8739516
12 written journal 0.9803922 0.9677419 0.9583333
13 written letters 1.0000000 0.9473684 0.9375000
14 written movie-script 0.9235276 0.8868550 0.9130547
15 written newspaper 1.0000000 1.0000000 0.8750000
16 written non-fiction 0.9178571 0.9446494 0.9370370
17 written technical 0.9682922 0.9537179 0.9442857
18 written travel-guides 1.0000000 0.9640288 0.9655172
19 written twitter 0.8560688 0.8378870 0.9348486
Variables not shown: recall_core (dbl), recall_open (dbl), recall_punkt (dbl)
with(results %>% left_join(masc, by="text_id")
%>% filter(mode == "written" & class != "email"), {
boxplot(recall_core, recall_open, recall_punkt,
names=c("CoreNLP", "OpenNLP", "Punkt"))
})
sessionInfo()
R version 3.2.3 (2015-12-10)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: OS X 10.10.5 (Yosemite)
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] methods stats graphics grDevices utils datasets base
other attached packages:
[1] tm_0.6-2 openNLP_0.2-5 NLP_0.1-8 Matrix_1.2-3
[5] coreNLP_0.4-1 jsonlite_0.9.16 dplyr_0.4.1 RColorBrewer_1.1-2
[9] knitr_1.12.3
loaded via a namespace (and not attached):
[1] Rcpp_0.11.5 magrittr_1.5 lattice_0.20-33
[4] R6_2.0.1 stringr_0.6.2 plyr_1.8.1
[7] tools_3.2.3 parallel_3.2.3 grid_3.2.3
[10] plotrix_3.6-1 DBI_0.3.1 lazyeval_0.1.10
[13] assertthat_0.1 digest_0.6.8 rJava_0.9-8
[16] openNLPdata_1.5.3-2 formatR_1.1 codetools_0.2-14
[19] evaluate_0.8 slam_0.1-32 XML_3.98-1.3