Introduction to R for Data Science
Lecturers
dipl. ing Branko Kovač
Data Analyst at CUBE / Data Science Mentor at Springboard
Data Science Serbia
dr Goran S. Milovanović
Data Scientist at DiploFoundation
Data Science Serbia
Text-Mining in R + Binomial Logistic Regression
• Part 1: Basics of text-mining in R with the tm package
  • Web-scraping w. tm.plugin.webmining
  • tm package corpora structures
  • Metadata and content
  • Text transformations
  • Term-Document Matrix extraction with tf-idf weighting
• Part 2: Introduction to Binomial Logistic Regression
  • Binomial Logistic Regression from Generalized Linear Models with glm() in R
  • Basic model assessment
Intro to R for Data Science
Session 8: Intro to text-mining in R + Binomial Logistic Regression
########################################################
# Introduction to R for Data Science
# SESSION 8 :: 16 June, 2016
# Binomial Logistic Regression + Intro to Text Mining in R
# Data Science Community Serbia + Startit
# :: Goran S. Milovanović and Branko Kovač ::
########################################################
# libraries
library(tm)
library(tm.plugin.webmining)

# source: Google Finance
# search queries: .com vs. hardware companies
searchQueries <- list(c('NASDAQ:GOOGL', 'NASDAQ:AMZN', 'NASDAQ:JD', 'NASDAQ:FB', 'NYSE:BABA'),
                      c('NYSE:HPQ', 'NASDAQ:AAPL', 'KRX:005930', 'TPE:2354', 'NYSE:IBM'))
Intro to Text-Mining in R: tm + tm.plugin.webmining packages
# retrieve w. tm.plugin.webmining
# retrieve news for dotCom companies
dotCom <- lapply(searchQueries[[1]], function(x) {
  WebCorpus(GoogleFinanceSource(x))
})
# now retrieve news for hardware companies
hardware <- lapply(searchQueries[[2]], function(x) {
  WebCorpus(GoogleFinanceSource(x))
})
Retrieving GoogleFinance from tm.plugin.webmining
# source: Google News
searchQueriesNews <- list(c("Google", "Amazon", "JD.com", "Facebook", "Alibaba"),
                          c("Hewlett Packard", "Apple", "Samsung", "Foxconn", "IBM"))
# retrieve news for dotCom companies
dotComNews <- lapply(searchQueriesNews[[1]], function(x) {
  googleNewsSRC <- GoogleNewsSource(x,
                                    params = list(hl = "en", q = x, ie = "utf-8",
                                                  num = 30, output = "rss"))
  WebCorpus(googleNewsSRC)
})
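The matching retrieval for the hardware companies' news is not shown in the deck, although hardwareNews is used later when the corpora are combined; a minimal sketch, assuming it mirrors the dotComNews call:
# retrieve news for hardware companies (assumed: mirrors the dotComNews call)
hardwareNews <- lapply(searchQueriesNews[[2]], function(x) {
  googleNewsSRC <- GoogleNewsSource(x,
                                    params = list(hl = "en", q = x, ie = "utf-8",
                                                  num = 30, output = "rss"))
  WebCorpus(googleNewsSRC)
})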
Retrieving GoogleNews from tm.plugin.webmining
# tm corpora are lists
# a tm corpus (WebCorpus, more specifically)
dotCom[[1]]
# a PlainTextDocument in the first dotCom corpus
class(dotCom[[1]][[1]])
dotCom[[1]][[1]]
# document metadata
dotCom[[1]][[1]]$meta
# document content
dotCom[[1]][[1]]$content
# let's add another tag to the document metadata structure: dotCom
dotCom <- lapply(dotCom, function(x) {
  x <- tm_map(x, function(doc) { # tm_map works over tm corpora, similarly to lapply
    meta(doc, "category") <- "dotCom"
    return(doc)
  })
})
dotCom[[1]][[1]]$meta
tm corpora: metadata and content
# add a category tag to the document metadata structure: dotComNews
dotComNews <- lapply(dotComNews, function(x) {
  x <- tm_map(x, function(doc) {
    meta(doc, "category") <- "dotCom"
    return(doc)
  })
})
dotComNews[[1]][[1]]$meta
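The analogous tagging of the hardware corpora is not shown in the deck, but the "hardware" label is relied on later when the outcome is recoded; a minimal sketch (the tagHardware helper is ours, not from the deck):
# tag the hardware corpora with category = "hardware" (assumed, by analogy)
tagHardware <- function(x) {
  tm_map(x, function(doc) {
    meta(doc, "category") <- "hardware"
    return(doc)
  })
}
hardware <- lapply(hardware, tagHardware)
hardwareNews <- lapply(hardwareNews, tagHardware)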
Combining tm corpora
# combine corpora w. do.call() and c()
# do.call comes in handy; similar to lapply
# Learn more about do.call: http://www.stat.berkeley.edu/~s133/Docall.html
dotCom <- do.call(c, dotCom)
hardware <- do.call(c, hardware)
dotComNews <- do.call(c, dotComNews)
hardwareNews <- do.call(c, hardwareNews)
workCorpus <- c(dotCom, dotComNews, hardware, hardwareNews)
#### Part II: Text Preprocessing
### text pre-processing
workCorpus[[1]]$content # we need to clean up the docs
# reminder: regex in R
# https://stat.ethz.ch/R-manual/R-devel/library/base/html/regex.html
removeSpecial <- function(x) {
  # replacing w. space might turn out to be handy
  x$content <- gsub("\\t|\\r|\\n", " ", x$content)
  return(x)
}
# example:
cleanDoc <- removeSpecial(workCorpus[[1]])
cleanDoc$content
# removeSpecial with tm_map {tm}
workCorpus <- tm_map(workCorpus, removeSpecial)
workCorpus[[1]]$content
Text Preprocessing w. tm
#### {tm} by-the-book text pre-processing
# remove punctuation
workCorpus <- tm_map(workCorpus, removePunctuation) # built in
workCorpus[[1]]$content
# remove numbers
workCorpus <- tm_map(workCorpus, removeNumbers) # built in
workCorpus[[1]]$content
# all characters tolower
toLower <- function(x) {
  x$content <- tolower(x$content)
  return(x)
}
workCorpus <- tm_map(workCorpus, toLower)
workCorpus[[1]]$content
# remove stop words
workCorpus <- tm_map(workCorpus, removeWords, stopwords("english")) # built in
workCorpus[[1]]$content
Typical preprocessing steps
# stemming
# see: http://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html
library(SnowballC) # nice stemming algorithm (Porter, 1980)
# see: https://cran.r-project.org/web/packages/SnowballC/index.html
workCorpus <- tm_map(workCorpus, stemDocument)
workCorpus[[1]]$content
# remove potentially dangerous predictors: company names
# Why? (In a nutshell: we are trying to discover something here,
# not to 'confirm' our potentially redundant knowledge)
predWords <- tolower(c("Alphabet", "Google", "Amazon", "JD.com", "Facebook", "Alibaba",
                       "HP", "Hewlett Packard", "Apple", "Samsung", "Foxconn", "IBM"))
predWordsRegex <- paste(stemDocument(predWords), collapse = "|")
replacePreds <- function(x) {
  x$content <- gsub(predWordsRegex, " ", x$content)
  return(x)
}
workCorpus <- tm_map(workCorpus, replacePreds)
workCorpus[[1]]$content
Stemming
#### Part III: Feature Selection: Term-Document Frequency Matrix (TDM)
# TDM
# rows = terms; columns = docs
# we will use the tf-idf weighting
# see: https://en.wikipedia.org/wiki/Tf%E2%80%93idf
dT <- TermDocumentMatrix(workCorpus,
                         control = list(tolower = FALSE,
                                        wordLengths = c(3, Inf), # NB: '=', not '<-', inside the control list
                                        weighting = weightTfIdf))
# dT is a *sparse matrix* (simple triplet form); maybe you want to learn more
# about the R {slam} package:
# https://cran.r-project.org/web/packages/slam/index.html
class(dT)
dT$i # rows w. non-zero entries
dT$j # columns w. non-zero entries
dT$v # non-zero entry in [i,j]
dT$nrow # "true" row dimension
dT$ncol # "true" column dimension
dT$dimnames$Terms # self-explanatory
dT$dimnames$Docs # self-explanatory
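To see how the simple triplet representation encodes the matrix, a tiny illustrative check (not in the original deck; note that as.matrix() can be expensive for large corpora):
# v[k] is the value stored at row i[k], column j[k]
k <- 1
dT$v[k] == as.matrix(dT)[dT$i[k], dT$j[k]] # TRUE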
Term-Document Matrix from tm: tf-idf weighting
# remove sparse terms
docTerm <- removeSparseTerms(dT, sparse = .75) # sparse is in (0, 1]
dim(docTerm)
docTerm$dimnames$Terms
# see: [1] http://www.inside-r.org/packages/cran/tm/docs/removeSparseTerms
# "A term-document matrix where those terms from x are removed which have
# at least a sparse percentage of empty (i.e., terms occurring 0 times in a document)
# elements." [1]
# NOTE: **very important**; in a real-world application you would probably need
# to run many models with features obtained from various levels of sparsity
# and perform model selection; see the sketch below.
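A minimal sketch of such a sweep (the thresholds are arbitrary, for illustration only):
# how many terms survive at different sparsity thresholds?
sparsityLevels <- c(.50, .75, .90, .99)
sapply(sparsityLevels, function(s) {
  dim(removeSparseTerms(dT, sparse = s))[1] # number of retained terms
})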
Sparse terms
# as.matrix
docTerm <- as.matrix(docTerm)
colnames(docTerm) <- paste0("d", 1:ncol(docTerm))
# pick some terms at random
# NB: the deck references an undefined 'tfIdf' vector here; we sample rows of docTerm instead
w <- sample(1:nrow(docTerm), 4, replace = F)
par(mfcol = c(2, 2))
for (i in 1:length(w)) {
  hist(docTerm[w[i], ],
       main = paste0("Distribution of '", rownames(docTerm)[w[i]], "'"),
       cex.main = .85,
       xlab = "Tf-Idf",
       ylab = "Count")
}
# quite interesting, isn't it? - How do you perform regression with these?
The distribution of tf-idf scores
# How well can the selected words predict the document category?
# How to relate continuous, non-normally distributed predictors to a categorical outcome?
# Idea: the logistic function
par(mfcol = c(1, 1))
logistic <- function(t) {exp(t)/(1 + exp(t))}
curve(logistic, from = -10, to = 10, n = 1000,
      main = "Logistic Function", cex.main = .85,
      xlab = "t", ylab = "Logistic(t)",
      cex.lab = .85)
# and then let t be b0 + b1*x1 + b2*x2 + ... + bn*xn
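To make that last comment concrete, a small sketch with made-up coefficients (illustration only):
# logistic of a linear predictor t = b0 + b1*x (b0 and b1 are made up)
b0 <- -1; b1 <- 2.5
x <- seq(-4, 4, length.out = 200)
plot(x, logistic(b0 + b1 * x), type = "l",
     xlab = "x", ylab = "P(Outcome)",
     main = "Logistic of a Linear Predictor", cex.main = .85)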
Logistic function
# prepare data set
dataSet <- data.frame(t(docTerm))
dataSet$Category <- as.character(meta(workCorpus, tag = "category"))
table(dataSet$Category)
# does every word play at least some role in each category?
w1 <- which(colSums(dataSet[which(dataSet$Category == "dotCom"),
                            1:(dim(dataSet)[2]-1)]) == 0)
colnames(dataSet)[w1]
w2 <- which(colSums(dataSet[which(dataSet$Category == "hardware"),
                            1:(dim(dataSet)[2]-1)]) == 0)
colnames(dataSet)[w2]
# recode category
library(plyr)
dataSet$Category <- as.numeric(revalue(dataSet$Category,
                                       c("dotCom" = 1, "hardware" = 0)))
Prepare Binomial Logistic Model
#### Logistic Regression
# Binomial Logistic Regression: use glm w. logit link (link='logit' is the default)
bLRmodel <- glm(Category ~ .,
                family = binomial(link = 'logit'),
                control = list(maxit = 500),
                data = dataSet)
sumLR <- summary(bLRmodel)
sumLR
# Coefficients
# NOTE: coefficients here relate a unit change in the predictor to the logit[P(Outcome)]
# logit(p) = log(p/(1-p)) - also known as log-odds
sumLR$coefficients
class(sumLR$coefficients)
coefLR <- as.data.frame(sumLR$coefficients)
# Wald statistics significant? (this Wald z is normally distributed)
coefLR <- coefLR[order(-coefLR$Estimate), ]
w <- which((coefLR$`Pr(>|z|)` < .05) & (!(rownames(coefLR) == "(Intercept)")))
# which predictors worked?
rownames(coefLR)[w]
# NOTE: the Wald statistic (z) is dangerous: as the coefficient gets larger, its standard error
# inflates, thus underestimating z; beware of z...
Binomial Logistic Regression
# plot coefficients {ggplot2}
library(ggplot2)
plotFrame <- coefLR[w, ]
plotFrame$Estimate <- round(plotFrame$Estimate, 2)
plotFrame$Features <- rownames(plotFrame)
plotFrame <- plotFrame[order(-plotFrame$Estimate), ]
plotFrame$Features <- factor(plotFrame$Features,
                             levels = plotFrame$Features,
                             ordered = T)
ggplot(data = plotFrame, aes(x = Features, y = Estimate)) +
  geom_line(group = 1) +
  geom_point(color = "red", size = 2.5) +
  geom_point(color = "white", size = 2) +
  xlab("Features") + ylab("Regression Coefficients") +
  ggtitle("Logistic Regression: Coefficients (sig. Wald test)") +
  theme(axis.text.x = element_text(angle = 90))
Coefficients
# fitted probabilities
fitted(bLRmodel)
hist(fitted(bLRmodel), 50)
plot(density(fitted(bLRmodel)), main = "Predicted Probabilities: Density")
polygon(density(fitted(bLRmodel)), col = "red", border = "black")
Fitted probabilities
# coefficients related to odds (and not log-odds): simply exponentiate
# NB: w indexes the rows of the sorted coefLR, so take the estimates from there
exp(coefLR$Estimate[w])
# check max
max(exp(coefLR$Estimate[w])) # huge? why? - think!
#### Reminder: Maximum Likelihood Estimation
normData <- rnorm(10000, mean = 5.75, sd = 1.25)
normLogLike <- function(params, x) {
  mean <- params[1]
  sd <- abs(params[2]) # dnorm generates NaNs if sd < 0
  dens <- dnorm(x, mean, sd)
  w <- which(dens == 0)
  dens[w] <- .Machine$double.xmin # avoid log(0)
  return(-(sum(log(dens)))) # negative logLike, for minimization w. optim()
}
# ML estimation
# random initial values
startMean <- runif(1, -100, 100)
startSd <- runif(1, -100, 100)
mlFit <- optim(c(startMean, startSd),
               fn = normLogLike,
               x = normData,
               control = list(maxit = 50000))
# ML estimates
mlFit$par # cmp. true parameters: mean = 5.75, sd = 1.25
# NB: take abs() of the sd estimate; normLogLike only ever uses abs(params[2])
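As a sanity check: for a normal sample, the ML estimates have a closed form, namely the sample mean and the (biased, divide-by-n) standard deviation:
# closed-form ML estimates for the normal distribution; cmp. mlFit$par
c(mean(normData), sqrt(mean((normData - mean(normData))^2)))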
Reminder: Maximum-Likelihood Estimation
# model Chi-Square
CSq <- bLRmodel$null.deviance - bLRmodel$deviance # this difference ~ Chi-Square distribution
CSq
dfCSq <- bLRmodel$df.null - bLRmodel$df.residual # null - residual (model) degrees of freedom
dfCSq
# Chi-Square significance test in R:
pCSq <- 1 - pchisq(CSq, dfCSq) # 1 - c.d.f. = P(a Chi-Square at least this large by chance)
pCSq
# AIC = Akaike information criterion (-2*logLikelihood + 2*k, k = num. parameters)
bLRmodel$aic
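The AIC formula in the comment can be checked directly with the standard logLik() and coef() accessors:
# AIC by hand: should match bLRmodel$aic
-2 * as.numeric(logLik(bLRmodel)) + 2 * length(coef(bLRmodel))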
Χ² and Akaike Information Criterion
#### w. training vs. test data set
# split into test and training
dim(dataSet)
choice <- sample(1:nrow(dataSet), 250, replace = F) # the deck hard-codes 475 (= nrow(dataSet) in its run)
test <- which(!(c(1:nrow(dataSet)) %in% choice))
trainData <- dataSet[choice, ]
newData <- dataSet[test, ]
# check!
sum(dataSet$Category[choice])/length(choice) # proportion of dotCom in training
sum(dataSet$Category[test])/length(test) # proportion of dotCom in test
# Binomial Logistic Regression: use glm w. logit link
bLRmodel <- glm(Category ~ .,
                family = binomial(link = 'logit'),
                control = list(maxit = 500),
                data = trainData)
sumLR <- summary(bLRmodel)
sumLR
Training and test data
# fitted probabilities
fitted(bLRmodel)
hist(fitted(bLRmodel), 50)
plot(density(fitted(bLRmodel)), main = "Predicted Probabilities: Density")
polygon(density(fitted(bLRmodel)), col = "red", border = "black")
# prediction from the model
predictions <- predict(bLRmodel, newdata = newData, type = 'response')
predictions <- ifelse(predictions >= 0.5, 1, 0)
trueCategory <- newData$Category
meanClasError <- mean(predictions != trueCategory)
accuracy <- 1 - meanClasError
accuracy # probably rather poor..? - Why? - Think!
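A confusion matrix makes the error structure visible (a small addition, using base table()):
# cross-tabulate predicted vs. true categories
table(predicted = predictions, true = trueCategory)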
Predictions from the Binomial Logistic Regression Model
Try to train the binomial logistic regression model many times by randomly
assigning documents to the training and test data sets.
What happens? Why?
*Look* at your data set and *think* about it before actually modeling it.
A sketch of such a resampling experiment follows.
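A minimal sketch of the suggested experiment, repeating the random split and collecting test accuracies (the number of repetitions is arbitrary):
# repeat the random train/test split and look at the spread of accuracies
accuracies <- sapply(1:25, function(i) {
  choice <- sample(1:nrow(dataSet), 250, replace = FALSE)
  fit <- glm(Category ~ ., family = binomial(link = 'logit'),
             control = list(maxit = 500), data = dataSet[choice, ])
  preds <- ifelse(predict(fit, newdata = dataSet[-choice, ],
                          type = 'response') >= 0.5, 1, 0)
  mean(preds == dataSet$Category[-choice])
})
summary(accuracies) # a wide spread suggests unstable estimates: many features, few documents
hist(accuracies, 10)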