Patrick O. Perry, NYU Stern School of Business
We will use the following R packages.
library("jsonlite")
library("coreNLP")
library("Matrix")
library("NLP")
library("openNLP")
library("stringi")
To ensure consistent runs, we set the seed before performing any analysis.
set.seed(0)
We will analyze a subset of the Yelp Academic Dataset corresponding to reviews of the 500 businesses nearest to Columbia University (as of October 15, 2012). To get this data, take the following steps:
Visit Yelp's developer page to create a Yelp API account and log in to your account.
Visit Yelp's academic dataset page, then click on the “Download the dataset” button (in the “Access” section). The button is only visible after you log in to your Yelp API account.
At this point, you should have a file called yelp_academic_dataset.json.gz.
Run the 01_make_json.py and 02_subset_nyc.py scripts, available from the course webpage, to generate yelp-nyc-business.json and yelp-nyc-review.json. You will need Python version 3.4 or later.
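Once the scripts finish, a quick sanity check (not part of the original instructions) is to parse the first record of a generated file; this assumes the files are in your working directory.
# peek at the first business record to confirm the file parses correctly
# (illustrative check only)
line1 <- readLines("yelp-nyc-business.json", n=1)
str(jsonlite::fromJSON(line1), max.level=1)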
After downloading and pre-processing the data, you can load it into R. First, a random sample of 50 businesses.
nbus <- 50
business <- jsonlite::stream_in(file("yelp-nyc-business.json"), verbose=FALSE)
business <- business[sample(nrow(business), nbus),]
business <- business[order(business$name),] # sort alphabetically
Next, the reviews of those businesses.
review <- jsonlite::stream_in(file("yelp-nyc-review.json"), verbose=FALSE)
review <- review[review$business_id %in% business$business_id,]
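As a quick check (not part of the original analysis), we can see how many of the loaded reviews belong to each sampled business.
# number of reviews per sampled business (businesses with no reviews
# in the subset do not appear in the table)
rev_count <- table(review$business_id)
summary(as.vector(rev_count))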
We first tabulate the adjective counts with CoreNLP, starting its pipeline with only the annotators we need: tokenization, sentence splitting, and part-of-speech tagging.
coreNLP::initCoreNLP(annotators=c("tokenize", "ssplit", "pos"))
adj_core <- vector("list", nrow(business))
len_core <- numeric(nrow(business))
print(Sys.time())
[1] "2016-02-16 09:03:32 EST"
##pb <- txtProgressBar(0, nrow(business), style=3)
# for each business
for (b in seq_len(nrow(business))) {
    ##setTxtProgressBar(pb, b)

    # extract the reviews for that business
    b_id <- business[b, "business_id"]
    rev <- review[review$business_id == b_id,]

    # iterate over all reviews for that business and tabulate
    # the total length and adjective counts
    nword <- 0
    tab <- numeric()
    for (r in seq_len(nrow(rev))) {
        # annotate (POS tag) the review
        anno <- coreNLP::annotateString(rev[r, "text"])

        # extract the token information
        token <- coreNLP::getToken(anno)

        # map to the universal tagset
        ut <- coreNLP::universalTagset(token$POS)

        # update the word count (excluding punctuation)
        nword <- nword + sum(ut != ".")

        # extract the adjectives
        raw_tok <- token[ut == "ADJ", "token"]

        # normalize case
        tok <- stringi::stri_trans_nfkc_casefold(raw_tok)

        # count the occurrences
        t1 <- table(tok)

        # update the running adjective counts
        ix <- match(names(t1), names(tab))
        new <- is.na(ix)
        old <- !new
        tab[ix[old]] <- tab[ix[old]] + t1[old] # increment existing counts
        tab <- c(tab, t1[new])                 # append new words
    }
    len_core[[b]] <- nword
    adj_core[[b]] <- sort(tab, decreasing=TRUE)
}
##close(pb)
print(Sys.time())
[1] "2016-02-16 09:05:57 EST"
# (running time was about 2.5 minutes on my laptop; see the timestamps above)
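The match/append idiom that merges each per-review count table t1 into the running table tab can be opaque on first reading; here is a minimal illustration on made-up counts (the name cnt is used here only for the demo).
# toy illustration of the count-merging idiom used in the loop above
cnt <- c(good=2, great=1)        # running adjective counts
t1 <- table(c("good", "tasty"))  # counts from a new review
ix <- match(names(t1), names(cnt))
new <- is.na(ix)
cnt[ix[!new]] <- cnt[ix[!new]] + t1[!new] # "good" becomes 3
cnt <- c(cnt, t1[new])                    # "tasty" is appended
cnt                                       # good=3, great=1, tasty=1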
Next, we repeat the same tabulation with OpenNLP. We create the sentence, word, and part-of-speech annotators up front and reuse them for every review.
adj_open <- vector("list", nrow(business))
len_open <- numeric(nrow(business))
sent_ator <- openNLP::Maxent_Sent_Token_Annotator()
word_ator <- openNLP::Maxent_Word_Token_Annotator()
pos_ator <- openNLP::Maxent_POS_Tag_Annotator()
print(Sys.time())
[1] "2016-02-16 09:05:58 EST"
##pb <- txtProgressBar(0, nrow(business), style=3)
tagmap <- NLP::Universal_POS_tags_map[["en-ptb"]]
tagmap["#"] <- "." # missing as of NLP_0.1-8
# for each business
for (b in seq_len(nrow(business))) {
    ##setTxtProgressBar(pb, b)

    # extract the reviews for that business
    b_id <- business[b, "business_id"]
    rev <- review[review$business_id == b_id,]

    # iterate over all reviews for that business and tabulate
    # the total length and adjective counts
    nword <- 0
    tab <- numeric()
    for (r in seq_len(nrow(rev))) {
        txt <- rev[r, "text"]
        if (is.na(txt) || txt == "") {
            next # skip the review if the text is empty
        }

        # convert the review text to String
        s <- NLP::as.String(txt)

        # tokenize into sentences and words
        a2 <- NLP::annotate(s, list(sent_ator, word_ator))

        # annotate with POS
        a3 <- NLP::annotate(s, pos_ator, a2)

        # extract the words and their tags
        a3w <- subset(a3, type == "word")
        tags <- sapply(a3w$features, `[[`, "POS")

        # map to the universal tagset
        ut <- as.vector(tagmap[tags])

        # update the word count (excluding punctuation)
        nword <- nword + sum(ut != ".")

        # extract the adjectives
        a3w_adj <- a3w[ut == "ADJ"]
        if (length(a3w_adj) == 0) {
            next # skip the review if there are no adjectives
        }

        # extract the strings and normalize the case
        raw_tok <- s[a3w_adj]
        tok <- stringi::stri_trans_nfkc_casefold(raw_tok)

        # count the occurrences
        t1 <- table(tok)

        # update the running adjective counts
        ix <- match(names(t1), names(tab))
        new <- is.na(ix)
        old <- !new
        tab[ix[old]] <- tab[ix[old]] + t1[old] # increment existing counts
        tab <- c(tab, t1[new])                 # append new words
    }
    len_open[[b]] <- nword
    adj_open[[b]] <- sort(tab, decreasing=TRUE)
}
##close(pb)
print(Sys.time())
[1] "2016-02-16 09:07:10 EST"
# (running time was a little over a minute on my laptop; see the timestamps above)
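Before comparing adjectives, it is worth checking that the two pipelines roughly agree on review lengths. Here is a quick comparison of the per-business word counts (an illustrative check, not part of the original analysis).
# difference in total word counts between the two taggers, per business
summary(len_open - len_core)
cor(len_core, len_open)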
Here are the top two adjectives for each business, as reported by CoreNLP and OpenNLP:
data.frame(name=substr(business$name, 1, 20),
           core_adj1=sapply(adj_core, function(x) names(x)[1]),
           core_adj2=sapply(adj_core, function(x) names(x)[2]),
           open_adj1=sapply(adj_open, function(x) names(x)[1]),
           open_adj2=sapply(adj_open, function(x) names(x)[2]))
name core_adj1 core_adj2 open_adj1 open_adj2
1 120th Street Halal C halal tasty better efficient
2 999 Lounge nice good good nice
3 Africa Kine Restaura good nice good african
4 American Apparel american friendly american friendly
5 Amsterdam Cleaners & dry single dry single
6 Apple Tree Supermark good great good great
7 Barnes and Noble - C good cute good cute
8 Bioh Dominick MD PC attentive great attentive great
9 Bistro Ten 18 good great good great
10 Bodre Cut and Color good amazing good amazing
11 Bon French Cleaners better dry dry expensive
12 Book Culture great new great new
13 Butler Library different more different main
14 Cafe Nana good little good little
15 Carlton Tower Cleane great little great "little
16 Carnegie Nails & Spa little clean little clean
17 Casbah Rouge good great good great
18 Columbia Hardware an helpful knowledgeable helpful knowledgeable
19 Columbia University great amazing great amazing
20 Hana Sushi other crunchy other better
21 Jas Mart little japanese little japanese
22 Karrot great nice great nice
23 Koronet Pizza good big good big
24 La Belle great good great good
25 Minton's Playhouse great greatest great open
26 Morningside Dog Run small clean small clean
27 Move-it 4 Less more nice more nice
28 New Young Fish best cheap best cheap
29 O'Connell's Pub good great good great
30 One Cup Two Cupcakes good different good different
31 Oren's Daily Roast good friendly good friendly
32 Panda Garden Chinese chinese cheap chinese cheap
33 Panino Sportivo Roma good great good great
34 Patrick Ryan's good great good great
35 Riverside Animal Hos last additional last additional
36 Shangri-La Express good delicious good delicious
37 Sol La Ti's Music To particular clean particular clean
38 Thaddeus Harden Phot much great great much
39 The Cathedral Church great beautiful great beautiful
40 The Heights Bar & Gr good happy good happy
41 US Post Office other great little other
42 Vareli good great good great
43 Verizon Wireless other awful other awful
44 Wai Lee Chinese Rest chinese good good chinese
45 West Place chinese good chinese cheap
46 Westside Family Medi same rude fine same
47 Westside Market NYC good fresh good fresh
48 Westway Cafe good friendly good friendly
49 WKCR 89.9 FM least new least new
50 Yarntopia other first other first
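Judging from the table, the two taggers usually agree on the most common adjective. One way to quantify the agreement (an illustrative computation, not from the original analysis; it assumes every business has at least one tagged adjective, as in the run above):
# fraction of businesses where both taggers report the same top adjective
top_core <- sapply(adj_core, function(x) names(x)[1])
top_open <- sapply(adj_open, function(x) names(x)[1])
mean(top_core == top_open)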
sessionInfo()
R version 3.2.3 (2015-12-10)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: OS X 10.10.5 (Yosemite)
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] methods stats graphics grDevices utils datasets base
other attached packages:
[1] stringi_1.0-1 openNLP_0.2-5 NLP_0.1-8 Matrix_1.2-3 coreNLP_0.4-1
[6] jsonlite_0.9.16 RColorBrewer_1.1-2 knitr_1.12.3
loaded via a namespace (and not attached):
[1] Rcpp_0.11.5 codetools_0.2-14 lattice_0.20-33 XML_3.98-1.3
[5] digest_0.6.8 plyr_1.8.1 grid_3.2.3 formatR_1.1
[9] evaluate_0.8 openNLPdata_1.5.3-2 tools_3.2.3 stringr_0.6.2
[13] plotrix_3.6-1 rJava_0.9-8