Patrick O. Perry, NYU Stern School of Business
We will use the coreNLP
package, which requires the latest version of Java.
To get this working on Mac OS, follow these steps:
1. Install the latest Java SE Development Kit.
2. Re-configure R's Java settings by running the following command from Terminal.app:

   R CMD javareconf

3. In R, install or re-install rJava from source:

   install.packages('rJava',,'http://www.rforge.net/')

4. In a terminal, run

   sudo ln -f -s $(/usr/libexec/java_home)/jre/lib/server/libjvm.dylib /usr/lib
These instructions were culled from stackoverflow.com/a/32544358 and stackoverflow.com/a/31039105.
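To confirm that the setup worked, you can check that rJava can start a JVM from R; this is just a sanity check, and the version string will vary with your installation:

library("rJava")
.jinit()
# query the running JVM for its version
.jcall("java.lang.System", "S", "getProperty", "java.version")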
We will use the following R packages.
library("dplyr")
library("jsonlite")
library("coreNLP")
library("Matrix")
library("NLP")
library("openNLP")
library("tm")
To ensure consistent runs, we set the seed before performing any analysis.
set.seed(0)
We will be using the Manually Annotated Sub-Corpus (MASC) from the American National Corpus:
masc <- jsonlite::stream_in(file("anc-masc.json"), verbose=FALSE) # raw text
sent <- jsonlite::stream_in(file("anc-masc-s.json"), verbose=FALSE) # sentence boundaries
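The remainder of the analysis relies on masc having text, text_id, mode, and class columns, and on sent recording the character offsets of the gold-standard sentence boundaries (we use its end column below). For a quick look at the structure:

str(masc)
str(sent)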
Next, we form unigram and bigram counts for the corpus, using a custom tokenizer for the bigrams.
BigramTokenizer <- function(x) {
unlist(lapply(NLP::ngrams(NLP::words(x), 2), paste, collapse = " "),
use.names = FALSE)
}
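To see what this produces, here is the same NLP::ngrams call applied to a toy word sequence (the tokenizer itself receives the words of each document from tm):

unlist(lapply(NLP::ngrams(c("the", "quick", "brown", "fox"), 2),
              paste, collapse = " "))
# "the quick" "quick brown" "brown fox"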
corpus <- VCorpus(VectorSource(masc$text))
control <- list(tolower = TRUE, removePunctuation = TRUE,
                removeNumbers = TRUE, wordLengths=c(1, Inf))

# unigram counts, converted to a documents-by-terms sparse matrix
dtm <- DocumentTermMatrix(corpus, control=control)
unigram <- sparseMatrix(dtm$i, dtm$j, x = dtm$v, dim=dim(dtm),
                        dimnames=dimnames(dtm))

# bigram counts, using the tokenizer defined above
dtm <- DocumentTermMatrix(corpus,
                          control=c(control, tokenize = BigramTokenizer))
bigram <- sparseMatrix(dtm$i, dtm$j, x = dtm$v, dim=dim(dtm),
                       dimnames=dimnames(dtm))
(n12 <- sum(bigram[,"new york"]))
[1] 124
(n1 <- sum(unigram[,"new"]))
[1] 794
(n2 <- sum(unigram[,"york"]))
[1] 149
(n <- sum(unigram))
[1] 477813
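The code below carries out a binomial likelihood-ratio test (essentially Dunning's collocation test). Occurrences of "york" after "new" are modeled as Binomial(n1, p1) draws, and occurrences of "york" after other words as Binomial(n - n1, p2) draws; the null hypothesis is p1 = p2. The test statistic is the difference in deviances,

$$
G^2 = \mathrm{dev}_0 - \mathrm{dev}_1
    = -2 \log \frac{B(n_{12};\, n_1,\, \hat{p})\; B(n_2 - n_{12};\, n - n_1,\, \hat{p})}
                   {B(n_{12};\, n_1,\, \hat{p}_1)\; B(n_2 - n_{12};\, n - n_1,\, \hat{p}_2)},
$$

where $B(k;\, m,\, q)$ denotes the binomial probability mass function and $\hat{p} = n_2/n$, $\hat{p}_1 = n_{12}/n_1$, $\hat{p}_2 = (n_2 - n_{12})/(n - n_1)$ are the maximum-likelihood estimates under the two hypotheses. Under the null, $G^2$ is asymptotically $\chi^2_1$; since the alternative of interest is one-sided ($p_1 > p_2$), we set the statistic to zero when $\hat{p}_1 \le \hat{p}_2$.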
# null hypothesis: P(york|new) = P(york|-new)
p = n2 / n
dev0 <- -2 * (dbinom(n12, n1, p, log=TRUE)
+ dbinom(n2 - n12, n - n1, p, log=TRUE))
# alternative hypothesis: P(york|new) > P(york|-new)
p1 <- n12/n1
p2 <- (n2 - n12)/(n - n1)
if (p1 <= p2) {
dev1 <- dev0
} else {
dev1 <- -2 * (dbinom(n12, n1, p1, log=TRUE)
+ dbinom(n2 - n12, n - n1, p2, log=TRUE))
}
(chisq <- dev0 - dev1)
[1] 1472.869
(pval <- pchisq(chisq, df=1, lower.tail = FALSE))
[1] 3.063207e-322
ug <- colnames(unigram)
bg <- colnames(bigram)
n12 <- colSums(bigram)
# split each bigram into its component words (NA, NA for malformed entries)
words <- strsplit(bg, " ")
words <- lapply(words, function(w) if (length(w) == 2) w else c(NA, NA))
w <- matrix(match(unlist(words), ug), ncol=2, byrow=TRUE)
ok <- !is.na(w[,1]) & !is.na(w[,2])
n1 <- rep(NA, length(bg))
n2 <- rep(NA, length(bg))
n1[ok] <- colSums(unigram)[w[ok,1]]
n2[ok] <- colSums(unigram)[w[ok,2]]
n <- sum(unigram)
colloc <- data_frame(bigram=bg, n1, n2, n12)
colloc$chisq <- with(colloc, {
# null hypothesis: P(w2|w1) = P(w2|-w1)
p = n2 / n
dev0 <- -2 * (dbinom(n12, n1, p, log=TRUE)
+ dbinom(n2 - n12, n - n1, p, log=TRUE))
# alt hypothesis: P(w2|w1) > P(w2|-w1)
p1 <- n12/n1
p2 <- (n2 - n12)/(n - n1)
dev1 <- -2 * (dbinom(n12, n1, p1, log=TRUE)
+ dbinom(n2 - n12, n - n1, p2, log=TRUE))
ifelse(p1 <= p2, 0, dev0 - dev1)
})
colloc$pval <- pchisq(colloc$chisq, df=1, lower.tail = FALSE)
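As a quick consistency check, the vectorized computation should reproduce the by-hand "new york" result above (a chisq of about 1472.87):

colloc %>% filter(bigram == "new york")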
print(n=100, colloc %>% arrange(desc(chisq)))
Source: local data frame [236,292 x 6]
bigram n1 n2 n12 chisq
1 of the 12003 26107 2804 4483.7424
2 in the 8358 26107 2173 3874.0079
3 jack sparrow 464 231 222 3132.8576
4 united states 221 275 179 2611.6980
5 i think 6664 729 415 2583.9190
6 to be 13510 2521 754 2438.7978
7 tz id 150 295 150 2310.9826
8 hong kong 172 127 127 2147.3793
9 going to 525 13510 376 2073.7134
10 if you 1406 4777 381 1916.3666
11 i dont 6664 717 337 1915.5133
12 will be 1651 2521 321 1795.3247
13 i am 6664 407 264 1742.4935
14 have been 2349 762 243 1659.6422
15 it was 4059 3042 410 1587.5769
16 it is 4059 5352 497 1567.0414
17 on the 3408 26107 887 1552.6807
18 you know 4777 895 287 1549.4260
19 has been 964 762 190 1544.3022
20 elizabeth swann 163 96 93 1525.8272
21 new york 794 149 124 1472.8689
22 will turner 1651 149 135 1448.8616
23 more than 1000 664 173 1406.2400
24 the same 26107 300 269 1370.6457
25 would be 1213 2521 235 1304.2399
26 las vegas 69 68 64 1203.2063
27 this is 2781 5352 363 1182.1943
28 want to 385 13510 233 1157.6046
29 we have 2475 2349 257 1130.1066
30 do you 1309 4777 257 1104.5186
31 postal service 140 257 82 1074.1557
32 able to 151 13510 149 1043.0517
33 a lot 10456 168 150 1035.1055
34 i was 6664 3042 373 1015.3806
35 davy jones 58 117 58 999.7227
36 there are 1115 2316 185 993.2465
37 dont know 717 895 131 985.5995
38 the first 26107 582 302 984.2828
39 vice president 87 373 73 983.2494
40 i mean 6664 257 153 966.8531
41 you can 4777 1172 226 961.2615
42 to make 13510 534 234 957.5691
43 at least 2229 108 95 944.6468
44 jeffery clark 68 79 55 944.2783
45 let me 222 1303 103 918.6760
46 thank you 142 4777 114 912.2991
47 one of 1454 12003 323 904.4365
48 there is 1115 5352 217 878.6147
49 contenttype textplain 46 46 44 872.8536
50 at the 2229 26107 536 854.4836
51 ian malcolm 58 65 48 851.7200
52 alan grant 74 126 55 850.7177
53 does not 297 2263 115 842.1561
54 as a 2885 10456 395 836.4532
55 to get 13510 678 234 824.0167
56 part of 238 12003 153 823.2677
57 gen horatio 42 40 40 814.9630
58 can be 1172 2521 167 813.1067
59 rather than 121 664 73 806.3949
60 a few 10456 176 131 804.8453
61 charsetusascii contenttransferencoding 39 43 39 785.6280
62 out of 1037 12003 258 781.7230
63 textplain charsetusascii 46 39 39 773.0086
64 horatio sanchez 40 55 40 766.5895
65 kind of 154 12003 124 764.5321
66 sonoma county 66 74 46 763.7113
67 we can 2475 1172 159 763.2663
68 for the 4247 26107 734 762.4947
69 the united 26107 221 170 756.3593
70 to do 13510 1309 290 746.9904
71 did not 514 2263 120 735.9569
72 health care 154 145 56 730.4872
73 i have 6664 2349 277 730.1319
74 they were 1508 1420 141 725.9595
75 messageid javamailevansthyme 38 35 35 715.5208
76 such as 346 2885 111 707.1761
77 should be 405 2521 112 705.1471
78 need to 404 13510 174 703.8359
79 he was 1984 3042 190 702.5083
80 be able 2521 151 86 699.2811
81 make sure 534 273 74 697.9953
82 from the 2196 26107 484 695.3911
83 et al 50 94 42 695.2869
84 may be 527 2521 119 694.9955
85 hong kongs 172 43 43 693.7095
86 is a 5352 10456 488 691.8020
87 flying dutchman 59 38 36 660.9524
88 cutler beckett 35 57 35 660.4837
89 lord cutler 58 35 35 658.6070
90 contenttransferencoding bit 43 121 40 655.7139
91 number of 195 12003 123 654.3068
92 but i 2018 6664 244 653.7842
93 as well 2885 750 130 649.9114
94 do not 1309 2263 142 640.7530
95 subject re 287 69 48 635.8111
96 look at 287 2229 92 633.1685
97 he said 1984 931 122 628.9446
98 there was 1115 3042 145 623.4056
99 president bush 373 86 51 620.9797
100 trying to 125 13510 103 620.2518
.. ... ... ... ... ...
Variables not shown: pval (dbl)
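With a p-value for every bigram, a natural next step is to keep only the collocations that survive a multiple-testing correction. A minimal sketch, using a (conservative) Bonferroni cutoff at an arbitrary 0.01 level:

colloc %>%
    filter(!is.na(pval), pval < 0.01 / n()) %>%
    arrange(desc(chisq))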
The Stanford CoreNLP library uses a hand-written sentence splitter with special cases for abbreviations like “Mr.” and “Ph.D.”
# initialize the coreNLP library; this fails unless you've already run downloadCoreNLP
coreNLP::initCoreNLP(annotators=c("tokenize", "ssplit"))
# annotate each text
s_core <-
do(masc %>% group_by(text_id), {
anno <- coreNLP::annotateString(.$text)
# extract the token boundaries
tok <- coreNLP::getToken(anno)
# determine the sentence boundaries
(tok %>% group_by(sentence)
%>% summarize(start = min(CharacterOffsetBegin),
end = max(CharacterOffsetEnd)))
}) %>% ungroup()
Next, we split sentences with the Apache OpenNLP maximum-entropy sentence detector, via the openNLP package:
ator <- openNLP::Maxent_Sent_Token_Annotator(language="en")
s_open <-
do(masc %>% group_by(text_id), {
s <- NLP::as.String(.$text)
spans <- NLP::annotate(s, ator)
s_open <- as.data.frame(spans)
}) %>% ungroup()
The Punkt sentence splitter is available in Python, but not in R. Here is code using NLTK 3.0 (the pre-trained English Punkt model must first be installed, e.g. with nltk.download('punkt')):
import json
import nltk.data
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
infile = open('anc-masc.json', 'r')
outfile = open('anc-masc-punkt.json', 'w')
for line in infile:
obj = json.loads(line)
spans = sent_detector.span_tokenize(obj['text'])
for i,s in enumerate(spans):
json.dump({'text_id': obj['text_id'],
'sentence': i,
'begin': s[0],
'end': s[1]}, outfile)
outfile.write('\n')
outfile.close()
infile.close()
We can read the results from Punkt into R.
punkt <- jsonlite::stream_in(file("anc-masc-punkt.json"), verbose=FALSE)
# punkt gives [begin,end) spans in 0-based indices, and does not include
# trailing punctuation. Converting to R's 1-based, end-inclusive [begin,end]
# convention only requires shifting begin: a 0-based exclusive end coincides
# with a 1-based inclusive end.
s_punkt <- punkt %>% mutate(begin = begin + 1)
loss <- function(truth, est) {
tp <- length(intersect(truth, est)) # true positives
fp <- length(est) - tp # false positives
fn <- length(truth) - tp # false negatives
tn <- max(truth) - (tp + fp + fn) # true negatives
precision <- tp / (tp + fp)
recall <- tp / (tp + fn)
data_frame(precision, recall)
}
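As a sanity check on made-up boundaries: if the true sentence ends are at offsets 10, 20, 30 and the estimated ends are 10, 25, 30, then two estimates are true positives, one is a false positive, and one true boundary is missed, so precision and recall are both 2/3:

loss(c(10, 20, 30), c(10, 25, 30))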
results <-
do(sent %>% group_by(text_id), {
tid <- .$text_id[[1]]
truth <- .$end
est_core <- (s_core %>% filter(text_id == tid))$end
est_open <- (s_open %>% filter(text_id == tid))$end
est_punkt <- (s_punkt %>% filter(text_id == tid))$end
l_core <- loss(truth, est_core)
l_open <- loss(truth, est_open)
l_punkt <- loss(truth, est_punkt)
data_frame(text_id = tid,
precision_core = l_core$precision, recall_core = l_core$recall,
precision_open = l_open$precision, recall_open = l_open$recall,
precision_punkt = l_punkt$precision, recall_punkt = l_punkt$recall)
}) %>% ungroup()
(results %>% left_join(masc, by="text_id")
%>% group_by(mode, class)
%>% summarize(precision_core = median(precision_core),
precision_open = median(precision_open),
precision_punkt = median(precision_punkt),
recall_core = median(recall_core),
recall_open = median(recall_open),
recall_punkt = median(recall_punkt)))
Source: local data frame [19 x 8]
Groups: mode
mode class precision_core precision_open precision_punkt
1 spoken court-transcript 0.9888920 0.9792127 0.9869341
2 spoken debate-transcript 0.9902222 0.9905660 0.9875362
3 spoken face-to-face 1.0000000 1.0000000 0.9846154
4 spoken telephone 1.0000000 1.0000000 0.0000000
5 written blog 0.9259259 0.9489796 0.9250000
6 written email 0.9000000 0.6666667 0.8153409
7 written essays 0.7871300 0.7993529 0.7754242
8 written ficlets 0.9516129 0.9627451 0.9546436
9 written fiction 0.9955357 0.9981343 0.9911111
10 written govt-docs 0.9601329 0.9701493 0.9532374
11 written jokes 0.8746439 0.9222670 0.8739516
12 written journal 0.9803922 0.9677419 0.9583333
13 written letters 1.0000000 0.9473684 0.9375000
14 written movie-script 0.9235276 0.8868550 0.9130547
15 written newspaper 1.0000000 1.0000000 0.8750000
16 written non-fiction 0.9178571 0.9446494 0.9370370
17 written technical 0.9682922 0.9537179 0.9442857
18 written travel-guides 1.0000000 0.9640288 0.9655172
19 written twitter 0.8560688 0.8378870 0.9348486
Variables not shown: recall_core (dbl), recall_open (dbl), recall_punkt (dbl)
with(results %>% left_join(masc, by="text_id")
%>% filter(mode == "written" & class != "email"), {
boxplot(recall_core, recall_open, recall_punkt,
names=c("CoreNLP", "OpenNLP", "Punkt"))
})
sessionInfo()
R version 3.2.3 (2015-12-10)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: OS X 10.10.5 (Yosemite)
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] methods stats graphics grDevices utils datasets base
other attached packages:
[1] tm_0.6-2 openNLP_0.2-5 NLP_0.1-8 Matrix_1.2-3
[5] coreNLP_0.4-1 jsonlite_0.9.16 dplyr_0.4.1 RColorBrewer_1.1-2
[9] knitr_1.12.3
loaded via a namespace (and not attached):
[1] Rcpp_0.11.5 magrittr_1.5 lattice_0.20-33
[4] R6_2.0.1 stringr_0.6.2 plyr_1.8.1
[7] tools_3.2.3 parallel_3.2.3 grid_3.2.3
[10] plotrix_3.6-1 DBI_0.3.1 lazyeval_0.1.10
[13] assertthat_0.1 digest_0.6.8 rJava_0.9-8
[16] openNLPdata_1.5.3-2 formatR_1.1 codetools_0.2-14
[19] evaluate_0.8 slam_0.1-32 XML_3.98-1.3