Social Mining using R

## Twitter API ##

1. Extract 200 tweets from the hashtag “#california” and 200 from the hashtag “#newyork”.
2. Create two corpora from the two datasets.
3. Preprocess the corpora using the {tm} package in R.
4. Compute and display the most frequent terms (words) in each corpus.
5. Create two word clouds from the most frequent terms.
6. Compute the sentiment scores, i.e. determine whether the words used in the tweets are more positively or negatively charged (emotionally).

library(twitteR)
library(ROAuth)
#library(httr)
library(tm)
setwd("/Users/lisa/Downloads/")
# t.api.key <- "xxxx"
# t.api.secret <- "xxxx"

# setup_twitter_oauth(t.api.key, t.api.secret, access_token=NULL, access_secret=NULL)

# CA Tweets by tag
# display.tweet <- function (tweet) {
#   cat("Screen name:", tweet$getScreenName(), 
#       "\nText:", tweet$getText(), "\n\n")
# }
# catweets <- searchTwitter('#california', n = 200)

# save(list="catweets", file="catweets.RData")
load(file="catweets.RData") # loading saved tweets keeps your API key and secret out of the script
# for (t in catweets) { display.tweet(t)}
catweets <- lapply(catweets, function(t) {t$getText()})
length(catweets)

# NY Tweets by tag
# nytweets <- searchTwitter('#newyork', n = 200)
# save(list="nytweets", file="nytweets.RData")

load(file="nytweets.RData")
# for (t in nytweets) { display.tweet(t)}
nytweets <- lapply(nytweets, function(t) {t$getText()})
length(nytweets)

# create corpus
dataca <- Corpus(VectorSource(catweets))
datany <- Corpus(VectorSource(nytweets))

# transformations
#Reference for tryTolower(): Ted Kwartler ODSC Workshop: Intro to Text Mining using R
tryTolower <- function(x){
  # return NA when there is an error
  y = NA
  # tryCatch error
  try_error = tryCatch(tolower(x), error = function(e) e)
  # if not an error
  if (!inherits(try_error, 'error'))
    y = tolower(x)
  return(y)
}

removeURL <- function(x) {gsub("(http[^ ]*)", "", x)}
english.stopwords <- stopwords("en")
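# Illustrative check of removeURL() on a sample string (plain base-R gsub):
# removeURL("Great views today http://t.co/abc123 #california")
# [1] "Great views today  #california"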

#CA
# Gets this error when running these commands:
# Warning message:
#   In mclapply(content(x), FUN, ...) :
#   all scheduled cores encountered errors in user code
# cacorpus <- tm_map(dataca, content_transformer(tolower))
# cacorpus <- tm_map(cacorpus, content_transformer(removeNumberWords))
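# The working pipeline below avoids the error: every transformation is wrapped in
# content_transformer(), and the error-tolerant tryTolower() (applied last) replaces
# the plain tolower() call, which likely failed on tweets with unusual encodings.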

cacorpus <- tm_map(dataca, content_transformer(removePunctuation))
cacorpus <- tm_map(cacorpus, content_transformer(removeURL))
cacorpus <- tm_map(cacorpus, content_transformer(removeWords), english.stopwords)
cacorpus <- tm_map(cacorpus, content_transformer(stemDocument))
cacorpus <- tm_map(cacorpus, content_transformer(stripWhitespace))
cacorpus <- tm_map(cacorpus, content_transformer(removeNumbers))
cacorpus <- tm_map(cacorpus, content_transformer(tryTolower))

#NY
nycorpus <- tm_map(datany, content_transformer(removePunctuation))
nycorpus <- tm_map(nycorpus, content_transformer(removeURL))
nycorpus <- tm_map(nycorpus, content_transformer(removeWords), english.stopwords)
nycorpus <- tm_map(nycorpus, content_transformer(stemDocument))
nycorpus <- tm_map(nycorpus, content_transformer(stripWhitespace))
nycorpus <- tm_map(nycorpus, content_transformer(removeNumbers))
nycorpus <- tm_map(nycorpus, content_transformer(tryTolower))

# inspect the first two documents of each corpus
inspect(cacorpus[1:2])
inspect(nycorpus[1:2])

# Build the term document matrix
tdmca <- TermDocumentMatrix(cacorpus)
tdmny <- TermDocumentMatrix(nycorpus)
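# In a term-document matrix, rows are terms, columns are individual tweets,
# and each entry is the number of times that term occurs in that tweet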

# inspect part of the matrix

inspect(tdmca[150:160, 20:30])
# <<TermDocumentMatrix (terms: 11, documents: 11)>>
#   Non-/sparse entries: 1/120
# Sparsity           : 99%
# Maximal term length: 13
# Weighting          : term frequency (tf)
# 
# Docs
# Terms           20 21 22 23 24 25 26 27 28 29 30
# couture        0  0  0  0  0  0  0  0  0  0  0
# crane          0  0  0  0  0  0  0  0  0  0  0
# creek          0  0  0  0  0  0  0  0  1  0  0
# crownbeach     0  0  0  0  0  0  0  0  0  0  0
# cruise         0  0  0  0  0  0  0  0  0  0  0
# cruz           0  0  0  0  0  0  0  0  0  0  0
# cruzcrew       0  0  0  0  0  0  0  0  0  0  0
# cruzfamily     0  0  0  0  0  0  0  0  0  0  0
# cruztovictory  0  0  0  0  0  0  0  0  0  0  0
# cuadras        0  0  0  0  0  0  0  0  0  0  0
# customer       0  0  0  0  0  0  0  0  0  0  0

inspect(tdmny[150:160, 20:30])
# <<TermDocumentMatrix (terms: 11, documents: 11)>>
#   Non-/sparse entries: 3/118
# Sparsity           : 98%
# Maximal term length: 16
# Weighting          : term frequency (tf)
# 
# Docs
# Terms              20 21 22 23 24 25 26 27 28 29 30
# delivery          0  0  0  0  0  0  0  0  0  0  0
# dem               0  0  0  0  0  0  0  0  0  0  1
# democrat          0  0  0  0  0  0  0  0  0  0  0
# democrats         0  1  1  0  0  0  0  0  0  0  0
# demprimary        0  0  0  0  0  0  0  0  0  0  0
# deportallmuslims  0  0  0  0  0  0  0  0  0  0  0
# dernière          0  0  0  0  0  0  0  0  0  0  0
# designer          0  0  0  0  0  0  0  0  0  0  0
# detroit           0  0  0  0  0  0  0  0  0  0  0
# development       0  0  0  0  0  0  0  0  0  0  0
# dial              0  0  0  0  0  0  0  0  0  0  0

# save(list=("tdmca"), file = "caTDM.RData")
load(file="caTDM.RData")

# save(list=("tdmny"), file = "nyTDM.RData")
load(file="nyTDM.RData")

# convert TDM to a matrix
mca <- as.matrix(tdmca)
mny <- as.matrix(tdmny)

# View portion of the matrix
mca[150:160, 20:30]
mny[150:160, 20:30]

# frequent terms
head(sort(rowSums(mca), decreasing = T))
head(sort(rowSums(mny), decreasing = T))

findFreqTerms(tdmca, lowfreq=15)
# california      caprimary newyorkprimary          trump         albany     longisland 
# 183             21             19             17             15             15 

findFreqTerms(tdmny, lowfreq=20)
# newyork nyprimary     trump      jobs       now       nyc 
# 174        37        33        21        21        20 

# calculate the frequency of words 
wordFreqca <- rowSums(mca)
wordFreqny <- rowSums(mny)

# Sort the words by descending order of frequency
# wordFreqca <- sort(wordFreqca, decreasing=TRUE)
# wordFreqny <- sort(wordFreqny, decreasing=TRUE)

# word cloud
library(wordcloud)

palette <- brewer.pal(8,"Dark2")

set.seed(137)
wordcloud(words=names(wordFreqca), freq=wordFreqca, min.freq=5, random.order=F, colors=palette)
wordcloud(words=names(wordFreqny), freq=wordFreqny, min.freq=5, random.order=F, colors=palette)

![tweetwc.png](/site_media/media/cb630e760e8d1.png)

#Sentiment
sentiment <- function(text, pos.words, neg.words) {
  text <- gsub('[[:punct:]]', '', text)
  text <- gsub('[[:cntrl:]]', '', text)
  text <- gsub('\\d+', '', text)
  text <- tolower(text)
  # split the text into a vector of words
  words <- strsplit(text, '\\s+')
  words <- unlist(words)
  # find which words are positive
  pos.matches <- match(words, pos.words)
  pos.matches <- !is.na(pos.matches)
  # find which words are negative
  neg.matches <- match(words, neg.words)
  neg.matches <- !is.na(neg.matches)
  # calculate the sentiment score
  score <- sum(pos.matches) - sum(neg.matches)
  cat (" Positive: ", words[pos.matches], "\n")
  cat (" Negative: ", words[neg.matches], "\n")
  return (score)
}
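# Illustrative check (run after the lexicons below are loaded; assumes "love" and
# "hate" appear in the positive and negative word lists, respectively):
# sentiment("love the beaches hate the traffic", pos.words, neg.words)
#  Positive:  love 
#  Negative:  hate 
# [1] 0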

sentiment.na <- function(text, pos.words, neg.words) {
  text <- gsub('[[:punct:]]', '', text)
  text <- gsub('[[:cntrl:]]', '', text)
  text <- gsub('\\d+', '', text)
  text <- tolower(text)
  # split the text into a vector of words
  words <- strsplit(text, '\\s+')
  words <- unlist(words)
  # find which words are positive
  pos.matches <- match(words, pos.words)
  pos.matches <- !is.na(pos.matches)
  # find which words are negative
  neg.matches <- match(words, neg.words)
  neg.matches <- !is.na(neg.matches)
  # calculate the sentiment score
  p <- sum(pos.matches)
  n <- sum(neg.matches)
  if (p == 0 & n == 0)
    return (NA)
  else
    return (p - n)
}
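# Unlike sentiment(), sentiment.na() returns NA when a tweet contains no lexicon
# words at all, so purely neutral tweets can be dropped from the score tables.
# Illustrative (run after the lexicons below are loaded):
# sentiment.na("driving across town this morning", pos.words, neg.words)
# [1] NA   (assuming none of these words appear in the lexicons)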

# Lexicons
pos.words = scan('positive-words.txt',
                 what='character',
                 comment.char=';')

neg.words = scan('negative-words.txt',  
                 what='character', 
                 comment.char=';')
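# (These appear to be the Hu & Liu opinion-lexicon files, which use ';' as the
#  comment character; adjust the file names/paths if your copies differ.)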

# head(pos.words); head(neg.words)

# Pull the cleaned tweet text back out of the corpora
dfca <- data.frame(text=unlist(sapply(cacorpus, `[`, "content")), stringsAsFactors=F)
dfca <- dfca[,]   # drop to a plain character vector, one tweet per element
dfny <- data.frame(text=unlist(sapply(nycorpus, `[`, "content")), stringsAsFactors=F)
dfny <- dfny[,]   # drop to a plain character vector, one tweet per element

sentiment(dfca[[1]], pos.words, neg.words)
# Positive:  best 
# Negative:   
#   [1] 1
sentiment(dfny[[1]], pos.words, neg.words)
# Positive:  like trump 
# Negative:  puppet 
# [1] 1

sink(tempfile())   # divert the cat() output from sentiment() so the console stays clean
par(mfrow=c(1,2))
scoresca <- sapply(dfca, sentiment, pos.words, neg.words)
barplot(table(scoresca), xlab="Score", ylab="Count", col="cyan", main="#california\nwith cancelled words")
scoresny <- sapply(dfny, sentiment, pos.words, neg.words)
barplot(table(scoresny), xlab="Score", ylab="Count", col="cyan", main="#newyork\nwith cancelled words")
sink()   # restore console output; otherwise the addmargins() tables below are silently diverted

![withneutralsentiment.png](/site_media/media/6c48bd8c0e881.png)

# Sentiment Scores without NA
scoresca.na <- sapply(dfca, sentiment.na, pos.words, neg.words)
barplot(table(scoresca.na), xlab="Score", ylab="Count", ylim=c(0,40), col="blue", main="#california\nw/o cancelled words")
scoresny.na <- sapply(dfny, sentiment.na, pos.words, neg.words)
barplot(table(scoresny.na), xlab="Score", ylab="Count", ylim=c(0,40), col="blue", main="#newyork\nw/o cancelled words")

![noneutralsentiment.png](/site_media/media/6ff743360e881.png)

addmargins(table(scoresca))
addmargins(table(scoresny))
addmargins(table(scoresca.na))
addmargins(table(scoresny.na))

# Data frame of scores and tweets
# dfca.vector <- sapply(dfca,function (t) {(t)})
# x <- data.frame(Score=scoresca.na, Text=dfca.vector)
# View(x)

![sentimentscores.png](/site_media/media/7d5429b20e891.png)
### Sentiment scores summary ###
In general, tweets from both hashtags have positive sentiment; however, tweets from #california appear to carry a more negative connotation than those from #newyork.

## Facebook API ##
1. Consume the 100 most recent Facebook posts by the user “joebiden” using getPage() from R’s {Rfacebook} package.
a. Find the most liked post and its popularity.
b. Find the most commented post and the number of comments.
c. Create a word cloud based on the most popular words used in the most commented post.
2. Retrieve 100 Facebook pages matching the word “petaluma” using searchPages().
a. Rank the most frequent words and display them in a barplot.

# Use your credentials
library(Rfacebook)
#library(RCurl)
#library(RJSONIO)
library(ggplot2)   

# fb.app.id <- "xxxx"
# fb.app.secret <- "xxxx"
# 
# fb.oauth <- fbOAuth(app_id = fb.app.id,
#                     app_secret = fb.app.secret,
#                     extended_permissions = TRUE)
# save(fb.oauth, file="facebook_credentials")

par(mfrow=c(1,1))

# page <- getPage("joebiden", n=100, token=fb.oauth)
# #Error in callAPI(url = url, token = token) : An unknown error occurred
# page <- getPage("joebiden", n=50, token=fb.oauth)
# #Error in callAPI(url = url, token = token) : 
# #Please reduce the amount of data you're asking for, then retry your request
# page <- getPage("joebiden", n=20, token=fb.oauth)

# save(list=("page"), file = "jbfb.RData")
load(file="jbfb.RData")

# Bar plot of frequencies by type/category
barplot(table(page$type), xlab = "category", ylab = "frequency")

# Most liked post
most.liked <- data.frame(link=page$link,freq=page$likes_count)
most.liked[which.max(most.liked$freq),]

[https://www.facebook.com/joebiden/photos/a.10150487089926104.367464.7860876103/10151095211851104/?type=3][1]
Frequency: 73489 likes

# Most commented post
most.commented <- data.frame(link=page$link,freq=page$comments_count)
most.commented[which.max(most.commented$freq),]

[https://www.facebook.com/barackobama/photos/a.53081056748.66806.6815841748/10151374902831749/?type=3][2]
Frequency: 25255 comments

# post <- getPost(page[which.max(page$comments_count),]$id, token=fb.oauth)
# save(list = "post",file = "fbpost.RData")
load(file = "fbpost.RData")

# comments on the most commented post
comments <- post$comments$message

#create corpus
corpus <- Corpus(VectorSource(comments))

# clean corpus
corpus <- tm_map(corpus, content_transformer(tryTolower))
corpus <- tm_map(corpus, content_transformer(removePunctuation))
corpus <- tm_map(corpus, content_transformer(removeNumbers))
corpus <- tm_map(corpus, content_transformer(removeURL))
corpus <- tm_map(corpus, content_transformer(removeWords), english.stopwords)
corpus <- tm_map(corpus, content_transformer(removeWords), c("the"))
corpus <- tm_map(corpus, content_transformer(stemDocument))
corpus <- tm_map(corpus, content_transformer(stripWhitespace))

#create DTM
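# wordLengths=c(2,Inf) keeps terms of at least two characters;
# bounds=list(global=c(5,Inf)) keeps only terms appearing in five or more comments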
dtm <- DocumentTermMatrix(corpus, control = list(wordLengths=c(2,Inf), bounds = list(global = c(5,Inf))))
dtm

# Term frequencies
freq <- colSums(as.matrix(dtm)) # total frequency of each term across all comments
findFreqTerms(dtm, max(freq)) # the most frequent term; in this case "gun"
ord <- order(freq, decreasing = T) # indices of the terms, ordered by descending frequency
freq[head(ord)] # most frequent terms and their frequencies
# gun    people      will       can president        us 
# 162       146       128       112       103        93 

# Create a Word Cloud Plot
library(wordcloud)
set.seed(123)
wordcloud(names(freq), freq, min.freq=30, colors=brewer.pal(6, "Dark2"))

![biden.png](/site_media/media/1c3ecd0e0e8d1.png)

# using searchPages() to get page categories
# pages <- searchPages(string = "petaluma", n = 100, 
#                      token = fb.oauth)

# save(pages, file = "petpages.RData")
load(file="petpages.RData")

wf <- data.frame(word=names(table(pages$category)), freq=as.vector(table(pages$category)))
p <- ggplot(wf, aes(word, freq))    
p <- p + geom_bar(stat="identity")   
p <- p + theme(axis.text.x=element_text(size=12,angle=90, hjust=1))   
p 

![petaluma.png](/site_media/media/1fc8f3780e8d1.png)
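For step 2a as stated above, here is a minimal sketch of ranking the most frequent words rather than the page categories. It assumes the data frame returned by searchPages() includes a free-text description column (adjust the field name if your copy differs):

# build word frequencies from the page descriptions (illustrative sketch)
desc <- pages$description[!is.na(pages$description)]
desccorpus <- Corpus(VectorSource(desc))
desccorpus <- tm_map(desccorpus, content_transformer(tryTolower))
desccorpus <- tm_map(desccorpus, content_transformer(removePunctuation))
desccorpus <- tm_map(desccorpus, content_transformer(removeWords), english.stopwords)
descfreq <- sort(colSums(as.matrix(DocumentTermMatrix(desccorpus))), decreasing = TRUE)
barplot(head(descfreq, 15), las = 2, ylab = "frequency", main = "Most frequent words")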

[1]: https://www.facebook.com/joebiden/photos/a.10150487089926104.367464.7860876103/10151095211851104/?type=3
[2]: https://www.facebook.com/barackobama/photos/a.53081056748.66806.6815841748/10151374902831749/?type=3
