## Twitter API ##
1. Extract 200 tweets from the hashtag “#california” and 200 from the hashtag “#newyork”.
2. Create two corpora from the two datasets.
3. Preprocess the corpora using the {tm} package in R.
4. Compute and display the most frequent terms (words) in each corpus.
5. Create two word clouds from the most frequent terms.
6. Compute sentiment scores, i.e., determine whether the words used in the tweets are more positively or negatively (emotionally) charged.
```r
library(twitteR)
library(ROAuth)
# library(httr)
library(tm)

setwd("/Users/lisa/Downloads/")

# t.api.key <- "xxxx"
# t.api.secret <- "xxxx"
# setup_twitter_oauth(t.api.key, t.api.secret, access_token=NULL, access_secret=NULL)

# CA tweets by tag
# display.tweet <- function (tweet) {
#   cat("Screen name:", tweet$getScreenName(),
#       "\nText:", tweet$getText(), "\n\n")
# }
# catweets <- searchTwitter('#california', n = 200)
# save(list="catweets", file="catweets.RData")
load(file="catweets.RData")  # loading the saved tweets keeps the API key and secret out of the script
# for (t in catweets) { display.tweet(t) }
catweets <- lapply(catweets, function(t) {t$getText()})
length(catweets)

# NY tweets by tag
# nytweets <- searchTwitter('#newyork', n = 200)
# save(list="nytweets", file="nytweets.RData")
load(file="nytweets.RData")
# for (t in nytweets) { display.tweet(t) }
nytweets <- lapply(nytweets, function(t) {t$getText()})
length(nytweets)

# create the corpora
dataca <- Corpus(VectorSource(catweets))
datany <- Corpus(VectorSource(nytweets))

# transformations
# Reference for tryTolower(): Ted Kwartler, ODSC Workshop: Intro to Text Mining using R
tryTolower <- function(x) {
  # return NA when there is an error
  y = NA
  # tryCatch error
  try_error = tryCatch(tolower(x), error = function(e) e)
  # if not an error
  if (!inherits(try_error, 'error'))
    y = tolower(x)
  return(y)
}
removeURL <- function(x) {gsub("(http[^ ]*)", "", x)}
english.stopwords <- stopwords("en")

# CA
# Got this error when running these commands:
#   Warning message:
#   In mclapply(content(x), FUN, ...) :
#     all scheduled cores encountered errors in user code
# cacorpus <- tm_map(dataca, content_transformer(tolower))
# cacorpus <- tm_map(cacorpus, content_transformer(removeNumberWords))
cacorpus <- tm_map(dataca, content_transformer(removePunctuation))
cacorpus <- tm_map(cacorpus, content_transformer(removeURL))
cacorpus <- tm_map(cacorpus, content_transformer(removeWords), english.stopwords)
cacorpus <- tm_map(cacorpus, content_transformer(stemDocument))
cacorpus <- tm_map(cacorpus, content_transformer(stripWhitespace))
cacorpus <- tm_map(cacorpus, content_transformer(removeNumbers))
cacorpus <- tm_map(cacorpus, content_transformer(tryTolower))

# NY
nycorpus <- tm_map(datany, content_transformer(removePunctuation))
nycorpus <- tm_map(nycorpus, content_transformer(removeURL))
nycorpus <- tm_map(nycorpus, content_transformer(removeWords), english.stopwords)
nycorpus <- tm_map(nycorpus, content_transformer(stemDocument))
nycorpus <- tm_map(nycorpus, content_transformer(stripWhitespace))
nycorpus <- tm_map(nycorpus, content_transformer(removeNumbers))
nycorpus <- tm_map(nycorpus, content_transformer(tryTolower))

# inspect the first two documents
inspect(cacorpus[1:2])
inspect(nycorpus[1:2])

# build the term-document matrices
tdmca <- TermDocumentMatrix(cacorpus)
tdmny <- TermDocumentMatrix(nycorpus)

# inspect part of each matrix
inspect(tdmca[150:160, 20:30])
# <<TermDocumentMatrix (terms: 11, documents: 11)>>
# Non-/sparse entries: 1/120
# Sparsity           : 99%
# Maximal term length: 13
# Weighting          : term frequency (tf)
#
#                Docs
# Terms           20 21 22 23 24 25 26 27 28 29 30
#   couture        0  0  0  0  0  0  0  0  0  0  0
#   crane          0  0  0  0  0  0  0  0  0  0  0
#   creek          0  0  0  0  0  0  0  0  1  0  0
#   crownbeach     0  0  0  0  0  0  0  0  0  0  0
#   cruise         0  0  0  0  0  0  0  0  0  0  0
#   cruz           0  0  0  0  0  0  0  0  0  0  0
#   cruzcrew       0  0  0  0  0  0  0  0  0  0  0
#   cruzfamily     0  0  0  0  0  0  0  0  0  0  0
#   cruztovictory  0  0  0  0  0  0  0  0  0  0  0
#   cuadras        0  0  0  0  0  0  0  0  0  0  0
#   customer       0  0  0  0  0  0  0  0  0  0  0

inspect(tdmny[150:160, 20:30])
# <<TermDocumentMatrix (terms: 11, documents: 11)>>
# Non-/sparse entries: 3/118
# Sparsity           : 98%
# Maximal term length: 16
# Weighting          : term frequency (tf)
#
#                     Docs
# Terms                20 21 22 23 24 25 26 27 28 29 30
#   delivery            0  0  0  0  0  0  0  0  0  0  0
#   dem                 0  0  0  0  0  0  0  0  0  0  1
#   democrat            0  0  0  0  0  0  0  0  0  0  0
#   democrats           0  1  1  0  0  0  0  0  0  0  0
#   demprimary          0  0  0  0  0  0  0  0  0  0  0
#   deportallmuslims    0  0  0  0  0  0  0  0  0  0  0
#   dernière            0  0  0  0  0  0  0  0  0  0  0
#   designer            0  0  0  0  0  0  0  0  0  0  0
#   detroit             0  0  0  0  0  0  0  0  0  0  0
#   development         0  0  0  0  0  0  0  0  0  0  0
#   dial                0  0  0  0  0  0  0  0  0  0  0

# save(list=("tdmca"), file = "caTDM.RData")
load(file="caTDM.RData")
# save(list=("tdmny"), file = "nyTDM.RData")
load(file="nyTDM.RData")

# convert each TDM to a matrix
mca <- as.matrix(tdmca)
mny <- as.matrix(tdmny)

# view a portion of each matrix
mca[150:160, 20:30]
mny[150:160, 20:30]

# most frequent terms
head(sort(rowSums(mca), decreasing = T))
head(sort(rowSums(mny), decreasing = T))

findFreqTerms(tdmca, lowfreq=15)
# california caprimary newyorkprimary trump albany longisland
#        183        21             19    17     15         15
findFreqTerms(tdmny, lowfreq=20)
# newyork nyprimary trump jobs now nyc
#     174        37    33   21  21  20

# calculate the frequency of each word
wordFreqca <- rowSums(mca)
wordFreqny <- rowSums(mny)
# Sort the words in descending order of frequency
# wordFreqca <- sort(wordFreqca, decreasing=TRUE)
# wordFreqny <- sort(wordFreqny, decreasing=TRUE)

# word clouds
library(wordcloud)
palette <- brewer.pal(8, "Dark2")
set.seed(137)
wordcloud(words=names(wordFreqca), freq=wordFreqca, min.freq=5, random.order=F, colors=palette)
wordcloud(words=names(wordFreqny), freq=wordFreqny, min.freq=5, random.order=F, colors=palette)
```
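As a quick visual check on step 4, here is a minimal sketch (not part of the original run) that plots the ten most frequent terms of each corpus side by side; `top.terms()` is a small helper defined only for this illustration, built on the `tdmca` and `tdmny` matrices above.

```r
# Sketch only: barplots of the ten most frequent terms per corpus,
# derived from the term-document matrices built above.
top.terms <- function(tdm, n = 10) {
  head(sort(rowSums(as.matrix(tdm)), decreasing = TRUE), n)
}
par(mfrow = c(1, 2))
barplot(top.terms(tdmca), las = 2, main = "#california", ylab = "Frequency")
barplot(top.terms(tdmny), las = 2, main = "#newyork", ylab = "Frequency")
par(mfrow = c(1, 1))  # reset the plotting layout
```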

```r
# Sentiment
sentiment <- function(text, pos.words, neg.words) {
  text <- gsub('[[:punct:]]', '', text)
  text <- gsub('[[:cntrl:]]', '', text)
  text <- gsub('\\d+', '', text)
  text <- tolower(text)
  # split the text into a vector of words
  words <- strsplit(text, '\\s+')
  words <- unlist(words)
  # find which words are positive
  pos.matches <- match(words, pos.words)
  pos.matches <- !is.na(pos.matches)
  # find which words are negative
  neg.matches <- match(words, neg.words)
  neg.matches <- !is.na(neg.matches)
  # calculate the sentiment score
  score <- sum(pos.matches) - sum(neg.matches)
  cat(" Positive: ", words[pos.matches], "\n")
  cat(" Negative: ", words[neg.matches], "\n")
  return(score)
}

sentiment.na <- function(text, pos.words, neg.words) {
  text <- gsub('[[:punct:]]', '', text)
  text <- gsub('[[:cntrl:]]', '', text)
  text <- gsub('\\d+', '', text)
  text <- tolower(text)
  # split the text into a vector of words
  words <- strsplit(text, '\\s+')
  words <- unlist(words)
  # find which words are positive
  pos.matches <- match(words, pos.words)
  pos.matches <- !is.na(pos.matches)
  # find which words are negative
  neg.matches <- match(words, neg.words)
  neg.matches <- !is.na(neg.matches)
  # calculate the sentiment score; return NA when no charged words were found
  p <- sum(pos.matches)
  n <- sum(neg.matches)
  if (p == 0 & n == 0)
    return(NA)
  else
    return(p - n)
}

# Lexicons
pos.words = scan('positive-words.txt', what='character', comment.char=';')
neg.words = scan('negative-words.txt', what='character', comment.char=';')
# head(pos.words); head(neg.words)

# Source data frames built from the cleaned corpora
dfca <- data.frame(text=unlist(sapply(cacorpus, `[`, "content")), stringsAsFactors=F)
dfca <- dfca[,]
dfny <- data.frame(text=unlist(sapply(nycorpus, `[`, "content")), stringsAsFactors=F)
dfny <- dfny[,]

sentiment(dfca[[1]], pos.words, neg.words)
# Positive: best
# Negative:
# [1] 1
sentiment(dfny[[1]], pos.words, neg.words)
# Positive: like trump
# Negative: puppet
# [1] 1

sink(tempfile())  # silence the per-tweet cat() output while scoring
par(mfrow=c(1,2))
scoresca <- sapply(dfca, sentiment, pos.words, neg.words)
barplot(table(scoresca), xlab="Score", ylab="Count", col="cyan",
        main="#california\nwith cancelled words")
scoresny <- sapply(dfny, sentiment, pos.words, neg.words)
barplot(table(scoresny), xlab="Score", ylab="Count", col="cyan",
        main="#newyork\nwith cancelled words")
sink()  # restore console output
```
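To sanity-check the scoring logic, here is a small illustration with a made-up three-word lexicon (`toy.pos` and `toy.neg` are invented for this example only; the actual runs use the opinion-lexicon files loaded above):

```r
# Illustration only: a toy lexicon, not the positive-/negative-words files.
toy.pos <- c("great", "good", "love")
toy.neg <- c("bad", "terrible", "hate")
sentiment("I love the great weather, but the traffic is terrible", toy.pos, toy.neg)
#  Positive:  love great
#  Negative:  terrible
# [1] 1    (two positive matches minus one negative match)
```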

```r
# Sentiment scores without NA
scoresca.na <- sapply(dfca, sentiment.na, pos.words, neg.words)
barplot(table(scoresca.na), xlab="Score", ylab="Count", ylim=c(0,40), col="blue",
        main="#california\nw/o cancelled words")
scoresny.na <- sapply(dfny, sentiment.na, pos.words, neg.words)
barplot(table(scoresny.na), xlab="Score", ylab="Count", ylim=c(0,40), col="blue",
        main="#newyork\nw/o cancelled words")
```
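One way to see how many tweets the second pair of plots drops is to count the NA scores; a minimal sketch using the score vectors just computed (an NA means `sentiment.na()` found no lexicon words in that tweet):

```r
# Sketch: count the tweets scored NA by sentiment.na(), i.e. the ones
# excluded from the "w/o cancelled words" plots.
c(california = sum(is.na(scoresca.na)), newyork = sum(is.na(scoresny.na)))
```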

```r
addmargins(table(scoresca))
addmargins(table(scoresny))
addmargins(table(scoresca.na))
addmargins(table(scoresny.na))

# Data frame of scores and tweets
# dfca.vector <- sapply(dfca, function(t) {(t)})
# x <- data.frame(Score=scoresca.na, Text=dfca.vector)
# View(x)
```
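The four tables above can be condensed into a single side-by-side comparison; a minimal sketch, where `summarize.scores()` is a helper defined only here and the score vectors are the ones computed above:

```r
# Sketch: mean score and share of positive/negative tweets per hashtag,
# using the non-NA score vectors from above.
summarize.scores <- function(scores) {
  c(mean.score   = mean(scores, na.rm = TRUE),
    pct.positive = mean(scores > 0, na.rm = TRUE),
    pct.negative = mean(scores < 0, na.rm = TRUE))
}
rbind(california = summarize.scores(scoresca.na),
      newyork    = summarize.scores(scoresny.na))
```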

### Sentiment Scores Summary ###
In general, tweets from both states have positive sentiment. However, tweets from #california appear to carry a somewhat more negative tone than tweets from #newyork.
## Facebook API ##
1. Consume the 100 most recent Facebook posts by the user “joebiden” using getPage() from R’s {Rfacebook} package.
a. Find the most liked post and its popularity.
b. Find the most commented post and the number of comments.
c. Create a word cloud of the most popular words in the comments on the most commented post.
2. Consume up to 100 Facebook pages matching the word “petaluma” using searchPages().
a. Rank the page categories by frequency and display them in a barplot.
```r
# Use your own credentials
library(Rfacebook)
# library(RCurl)
# library(RJSONIO)
library(ggplot2)

# fb.app.id <- "xxxx"
# fb.app.secret <- "xxxx"
#
# fb.oauth <- fbOAuth(app_id = fb.app.id,
#                     app_secret = fb.app.secret,
#                     extended_permissions = TRUE)
# save(fb.oauth, file="facebook_credentials")

par(mfrow=c(1,1))

# page <- getPage("joebiden", n=100, token=fb.oauth)
# # Error in callAPI(url = url, token = token) : An unknown error occurred
# page <- getPage("joebiden", n=50, token=fb.oauth)
# # Error in callAPI(url = url, token = token) :
# #   Please reduce the amount of data you're asking for, then retry your request
page <- getPage("joebiden", n=20, token=fb.oauth)
# save(list=("page"), file = "jbfb.RData")
load(file="jbfb.RData")

# Bar plot of post frequencies by type/category
barplot(table(page$type), xlab = "category", ylab = "frequency")

# Most liked post
most.liked <- data.frame(link=page$link, freq=page$likes_count)
most.liked[which.max(most.liked$freq),]
```
[https://www.facebook.com/joebiden/photos/a.10150487089926104.367464.7860876103/10151095211851104/?type=3][1]
Frequency: 73489 likes
```r
# Most commented post
most.commented <- data.frame(link=page$link, freq=page$comments_count)
most.commented[which.max(most.commented$freq),]
```
[https://www.facebook.com/barackobama/photos/a.53081056748.66806.6815841748/10151374902831749/?type=3][2]
Frequency: 25255 comments
```r
# post <- getPost(page[which.max(page$comments_count),]$id, token=fb.oauth)
# save(list = "post", file = "fbpost.RData")
load(file = "fbpost.RData")

# comments on the most commented post
comments <- post$comments$message

# create the corpus
corpus <- Corpus(VectorSource(comments))

# clean the corpus
corpus <- tm_map(corpus, content_transformer(tryTolower))
corpus <- tm_map(corpus, content_transformer(removePunctuation))
corpus <- tm_map(corpus, content_transformer(removeNumbers))
corpus <- tm_map(corpus, content_transformer(removeURL))
corpus <- tm_map(corpus, content_transformer(removeWords), english.stopwords)
corpus <- tm_map(corpus, content_transformer(removeWords), c("the"))
corpus <- tm_map(corpus, content_transformer(stemDocument))
corpus <- tm_map(corpus, content_transformer(stripWhitespace))

# create the document-term matrix
dtm <- DocumentTermMatrix(corpus,
                          control = list(wordLengths = c(2, Inf),
                                         bounds = list(global = c(5, Inf))))
dtm

# term frequencies
freq <- colSums(as.matrix(dtm))
# terms that reach the maximum frequency; in this case "gun"
findFreqTerms(dtm, max(freq))
# order the frequencies (ord contains the indices)
ord <- order(freq, decreasing = T)
freq[head(ord)]
# Most frequent terms and their frequencies
#       gun    people      will       can president        us
#       162       146       128       112       103        93

# word cloud of the comments
library(wordcloud)
set.seed(123)
wordcloud(names(freq), freq, min.freq=30, colors=brewer.pal(6, "Dark2"))
```
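For a ranked (rather than cloud) view of the same comment vocabulary, a short sketch using the `freq` vector computed above:

```r
# Sketch: the 15 most frequent comment terms as a barplot, from freq above.
top15 <- sort(freq, decreasing = TRUE)[1:15]
barplot(top15, las = 2, ylab = "Frequency", main = "Most frequent words in comments")
```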

```r
# use searchPages() to get pages matching "petaluma" and their categories
pages <- searchPages(string = "petaluma", n = 100, token = fb.oauth)
save(pages, file = "petpages.RData")
load(file="petpages.RData")

wf <- data.frame(word=names(table(pages$category)),
                 freq=as.vector(table(pages$category)))
p <- ggplot(wf, aes(word, freq))
p <- p + geom_bar(stat="identity")
p <- p + theme(axis.text.x=element_text(size=12, angle=90, hjust=1))
p
```
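The bars above follow the default alphabetical order of the categories; a minimal sketch that reorders them by descending frequency, so the ranking asked for in step 2a is visible at a glance (assumes the `wf` data frame from above):

```r
# Sketch: same category counts, with bars sorted by descending frequency.
p.sorted <- ggplot(wf, aes(x = reorder(word, -freq), y = freq)) +
  geom_bar(stat = "identity") +
  labs(x = "category", y = "frequency") +
  theme(axis.text.x = element_text(size = 12, angle = 90, hjust = 1))
p.sorted
```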

[1]: https://www.facebook.com/joebiden/photos/a.10150487089926104.367464.7860876103/10151095211851104/?type=3
[2]: https://www.facebook.com/barackobama/photos/a.53081056748.66806.6815841748/10151374902831749/?type=3