Part-of-Speech Tagging

Patrick O. Perry, NYU Stern School of Business

Computing environment

We will use the following R packages.

library("jsonlite")
library("coreNLP")
library("Matrix")
library("NLP")
library("openNLP")
library("stringi")

To ensure consistent runs, we set the seed before performing any analysis.

set.seed(0)

Data

We will analyze a subset of the Yelp Academic Dataset corresponding to reviews of the 500 businesses nearest to Columbia University (as of October 15, 2012). To obtain the data, take the following steps:

  1. Visit Yelp's developer page to create a Yelp API account and log in to your account.

  2. Visit Yelp's academic dataset page, then click on the “Download the dataset” button (in the “Access” section). The button will only be visible after you have logged in to your Yelp API account.

At this point, you should have a file called yelp_academic_dataset.json.gz. Run the 01_make_json.py and 02_subset_nyc.py scripts, available from the course webpage, to generate yelp-nyc-business.json and yelp-nyc-review.json. You will need Python version 3.4 or later.
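
If you prefer to run the preprocessing from within R, the sketch below invokes the scripts with system2(); it assumes that python3 is on your PATH, that the scripts take no arguments, and that they and the downloaded archive are in your working directory.

system2("python3", "01_make_json.py")  # first preprocessing step
system2("python3", "02_subset_nyc.py") # produces yelp-nyc-business.json and yelp-nyc-review.json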

After downloading and pre-processing the data, you can load it into R. First, we read in the business records and take a random sample of 50 businesses.

nbus <- 50
business <- jsonlite::stream_in(file("yelp-nyc-business.json"), verbose=FALSE)
business <- business[sample(nrow(business), nbus),]
business <- business[order(business$name),] # sort alphabetically

Next, we load the reviews of those businesses.

review <- jsonlite::stream_in(file("yelp-nyc-review.json"), verbose=FALSE)
review <- review[review$business_id %in% business$business_id,]
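
A quick sanity check on the subset is worthwhile; the number of reviews will depend on the random seed and on the version of the dataset you downloaded.

nrow(business) # 50 sampled businesses
nrow(review)   # reviews belonging to those businesses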

Tagging with CoreNLP

We initialize the CoreNLP pipeline with only the annotators needed for part-of-speech tagging: tokenization, sentence splitting, and the tagger itself.

coreNLP::initCoreNLP(annotators=c("tokenize", "ssplit", "pos"))
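
Before looping over all of the reviews, it may help to see what the annotations look like. The sketch below, using a made-up example sentence, shows the token data frame returned by getToken and the corresponding universal tags; these are the same calls used in the loop that follows.

anno <- coreNLP::annotateString("The pizza was surprisingly good, but the service was slow.")
token <- coreNLP::getToken(anno)
token[, c("token", "POS")]          # one row per token, with its Penn Treebank tag
coreNLP::universalTagset(token$POS) # the same tags, mapped to the universal tagset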

# storage for the per-business adjective counts and total word counts
adj_core <- vector("list", nrow(business))
len_core <- numeric(nrow(business))

print(Sys.time())
[1] "2016-02-16 09:03:32 EST"
##pb <- txtProgressBar(0, nrow(business), style=3)

# for each business
for (b in seq_len(nrow(business))) {
    ##setTxtProgressBar(pb, b)

    # extract the reviews for that business
    b_id <- business[b, "business_id"]
    rev <- review[review$business_id == b_id,]

    # iterate over all reviews for that business and tabulate
    # the total length and adjective counts
    nword <- 0
    tab <- numeric()

    for (r in seq_len(nrow(rev))) {

        # annotate (POS tag) the review
        anno <- coreNLP::annotateString(rev[r, "text"])

        # extract the token information
        token <- coreNLP::getToken(anno)

        # map to the universal tagset
        ut <- coreNLP::universalTagset(token$POS)

        # update the word count
        nword <- nword + sum(ut != ".")

        # extract the adjectives
        raw_tok <- token[ut == "ADJ", "token"]

        # normalize case
        tok <- stringi::stri_trans_nfkc_casefold(raw_tok)

        # count the occurrences
        t1 <- table(tok)

        # update the table with the word counts
        ix <- match(names(t1), names(tab))
        new <- is.na(ix)
        old <- !new
        tab[ix[old]] <- tab[ix[old]] + t1[old] # increment existing counts
        tab <- c(tab, t1[new]) # append new words
    }

    len_core[[b]] <- nword
    adj_core[[b]] <- sort(tab, decreasing=TRUE)
}
##close(pb)
print(Sys.time())
[1] "2016-02-16 09:05:57 EST"
# (running time was ~30 minutes on my laptop)
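
Each element of adj_core now holds a named vector of adjective counts for one business, sorted in decreasing order, and len_core holds the corresponding total word counts. For example, to inspect the first business (alphabetically):

head(adj_core[[1]]) # most frequent adjectives in the reviews of the first business
len_core[[1]]       # total number of (non-punctuation) words in those reviews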

Tagging with OpenNLP

We now repeat the analysis with the Apache OpenNLP tagger, accessed through the NLP and openNLP packages. As before, we allocate storage for the per-business adjective counts and word totals, and we create the sentence, word, and part-of-speech annotators.

adj_open <- vector("list", nrow(business))
len_open <- numeric(nrow(business))

sent_ator <- openNLP::Maxent_Sent_Token_Annotator()
word_ator <- openNLP::Maxent_Word_Token_Annotator()
pos_ator <- openNLP::Maxent_POS_Tag_Annotator()
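
As with CoreNLP, a small sketch of the interface on a single made-up sentence may help before the main loop: we tokenize into sentences and words, add part-of-speech features, and then pull out the word tokens and their tags.

s <- NLP::as.String("The pizza was surprisingly good, but the service was slow.")
a <- NLP::annotate(s, list(sent_ator, word_ator)) # sentence and word tokens
a <- NLP::annotate(s, pos_ator, a)                # add POS features to the word tokens
aw <- subset(a, type == "word")
data.frame(token=s[aw], POS=sapply(aw$features, `[[`, "POS"))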

print(Sys.time())
[1] "2016-02-16 09:05:58 EST"
##pb <- txtProgressBar(0, nrow(business), style=3)

# map the Penn Treebank tags reported by OpenNLP to the universal tagset;
# the "#" tag is missing from the map as of NLP_0.1-8, so we add it by hand
tagmap <- NLP::Universal_POS_tags_map[["en-ptb"]]
tagmap["#"] <- "."

# for each business
for (b in seq_len(nrow(business))) {
    ##setTxtProgressBar(pb, b)

    # extract the reviews for that business
    b_id <- business[b, "business_id"]
    rev <- review[review$business_id == b_id,]

    # iterate over all reviews for that business and tabulate
    # the total length and adjective counts
    nword <- 0
    tab <- numeric()

    for (r in seq_len(nrow(rev))) {
        txt <- rev[r, "text"]
        if (is.na(txt) || txt == "") {
            next # skip the review if the text is empty
        }

        # convert the review text to String
        s <- NLP::as.String(txt)

        # tokenize into sentences and words
        a2 <- NLP::annotate(s, list(sent_ator, word_ator))

        # annotate with POS
        a3 <- NLP::annotate(s, pos_ator, a2)

        # extract the words, and their tags
        a3w <- subset(a3, type == "word")
        tags <- sapply(a3w$features, `[[`, "POS")

        # map to the universal tagset
        ut <- as.vector(tagmap[tags])

        # update the word count
        nword <- nword + sum(ut != ".")

        # extract the adjectives
        a3w_adj <- a3w[ut == "ADJ"]

        if (length(a3w_adj) == 0) {
            next # skip the review if there are no adjectives
        }

        # extract the string, and normalize the case
        raw_tok <- s[a3w_adj]
        tok <- stringi::stri_trans_nfkc_casefold(raw_tok)

        # count the occurrences
        t1 <- table(tok)

        # update the adjective table with the new word counts
        ix <- match(names(t1), names(tab))
        new <- is.na(ix)
        old <- !new
        tab[ix[old]] <- tab[ix[old]] + t1[old] # increment existing counts
        tab <- c(tab, t1[new]) # append new words
    }

    len_open[[b]] <- nword
    adj_open[[b]] <- sort(tab, decreasing=TRUE)
}
##close(pb)
print(Sys.time())
[1] "2016-02-16 09:07:10 EST"
# (running time was ~15 minutes on my laptop)

Results

Here are the top two adjectives for each business, as reported by CoreNLP and OpenNLP:

data.frame(name=substr(business$name, 1, 20),
           core_adj1=sapply(adj_core, function(x) names(x)[1]),
           core_adj2=sapply(adj_core, function(x) names(x)[2]),
           open_adj1=sapply(adj_open, function(x) names(x)[1]),
           open_adj2=sapply(adj_open, function(x) names(x)[2]))
                   name  core_adj1     core_adj2  open_adj1     open_adj2
1  120th Street Halal C      halal         tasty     better     efficient
2            999 Lounge       nice          good       good          nice
3  Africa Kine Restaura       good          nice       good       african
4      American Apparel   american      friendly   american      friendly
5  Amsterdam Cleaners &        dry        single        dry        single
6  Apple Tree Supermark       good         great       good         great
7  Barnes and Noble - C       good          cute       good          cute
8   Bioh Dominick MD PC  attentive         great  attentive         great
9         Bistro Ten 18       good         great       good         great
10 Bodre Cut and Color        good       amazing       good       amazing
11  Bon French Cleaners     better           dry        dry     expensive
12         Book Culture      great           new      great           new
13       Butler Library  different          more  different          main
14            Cafe Nana       good        little       good        little
15 Carlton Tower Cleane      great        little      great       "little
16 Carnegie Nails & Spa     little         clean     little         clean
17         Casbah Rouge       good         great       good         great
18 Columbia Hardware an    helpful knowledgeable    helpful knowledgeable
19 Columbia University       great       amazing      great       amazing
20           Hana Sushi      other       crunchy      other        better
21             Jas Mart     little      japanese     little      japanese
22               Karrot      great          nice      great          nice
23        Koronet Pizza       good           big       good           big
24             La Belle      great          good      great          good
25   Minton's Playhouse      great      greatest      great          open
26  Morningside Dog Run      small         clean      small         clean
27       Move-it 4 Less       more          nice       more          nice
28       New Young Fish       best         cheap       best         cheap
29      O'Connell's Pub       good         great       good         great
30 One Cup Two Cupcakes       good     different       good     different
31   Oren's Daily Roast       good      friendly       good      friendly
32 Panda Garden Chinese    chinese         cheap    chinese         cheap
33 Panino Sportivo Roma       good         great       good         great
34       Patrick Ryan's       good         great       good         great
35 Riverside Animal Hos       last    additional       last    additional
36   Shangri-La Express       good     delicious       good     delicious
37 Sol La Ti's Music To particular         clean particular         clean
38 Thaddeus Harden Phot       much         great      great          much
39 The Cathedral Church      great     beautiful      great     beautiful
40 The Heights Bar & Gr       good         happy       good         happy
41       US Post Office      other         great     little         other
42               Vareli       good         great       good         great
43     Verizon Wireless      other         awful      other         awful
44 Wai Lee Chinese Rest    chinese          good       good       chinese
45           West Place    chinese          good    chinese         cheap
46 Westside Family Medi       same          rude       fine          same
47  Westside Market NYC       good         fresh       good         fresh
48         Westway Cafe       good      friendly       good      friendly
49         WKCR 89.9 FM      least           new      least           new
50            Yarntopia      other         first      other         first
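
The two taggers mostly agree on the top adjectives. As a rough further check, the sketch below compares the non-punctuation word counts each tagger reports for every business; the counts should be similar but not identical, since the two systems tokenize the text differently.

summary(len_core - len_open) # per-business differences in word counts
cor(len_core, len_open)      # overall agreement between the two taggers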

Session information

sessionInfo()
R version 3.2.3 (2015-12-10)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: OS X 10.10.5 (Yosemite)

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] methods   stats     graphics  grDevices utils     datasets  base     

other attached packages:
[1] stringi_1.0-1      openNLP_0.2-5      NLP_0.1-8          Matrix_1.2-3       coreNLP_0.4-1     
[6] jsonlite_0.9.16    RColorBrewer_1.1-2 knitr_1.12.3      

loaded via a namespace (and not attached):
 [1] Rcpp_0.11.5         codetools_0.2-14    lattice_0.20-33     XML_3.98-1.3       
 [5] digest_0.6.8        plyr_1.8.1          grid_3.2.3          formatR_1.1        
 [9] evaluate_0.8        openNLPdata_1.5.3-2 tools_3.2.3         stringr_0.6.2      
[13] plotrix_3.6-1       rJava_0.9-8