Analyzing Word Use in “The House of Mirth” With R

In this series, we’ll cover different aspects of the R programming language. This final post is a very simple project I completed for my Introduction to R class that looks at word use in the text “The House of Mirth”.

My final project was a basic analysis of the words used in “The House of Mirth” and at what points they occurred in the text. I purposefully did not use any packages for the project as I wanted to attempt as much as I could with the skills I learned during my introductory course. As such, I have not removed any stop words nor completed any stemming. I have included the figures I created below for your reference. If you run the code yourself, you will want to run it in its divided chunks as the plots will overwrite themselves if you run it all at once. If you have any questions, feel free to contact me!


###Name: Cameron Cook
###Description: This code downloads a UTF-8 version text file
###of a novel from the Gutenberg Project. It then
###normalizes the text and looks at some words used by
###the author throughout the novel.

#URL of the novel (Project Gutenberg UTF-8 plain-text edition)
book <- "http://www.gutenberg.org/ebooks/284.txt.utf-8"

#Filename to be used for the downloaded text
dest <- "~/mirth.txt"

#Download from the URL to the filename
#"auto" is a value for the `method` argument; the documented values for
#`mode` (how the destination file is opened) are "w", "wb", "a" and "ab"
#Select the method automatically and open the file for writing so that
#re-running the script replaces the previous download
download.file(book, dest, method = "auto", mode = "w")

#Path name to data: the file that was just downloaded
path_to_data <- dest

#Read in the lines from the text as a character vector
#Separate the text on new lines, one element per line
#Skip the first 40 lines read in (extra header information)
#Trim surrounding whitespace and ignore blank lines
mirth <-
  scan(
    file = path_to_data,
    what = character(),
    sep = "\n",
    skip = 40,
    strip.white = TRUE,
    blank.lines.skip = TRUE
  )

#Locate the end of the text, denoted by a line reading "THE END"
bookEnd <- which(mirth == "THE END")

#Stop with a clear message when the marker is missing; 1:bookEnd would
#otherwise fail with an obscure "argument of length 0" error
if (length(bookEnd) == 0) {
  stop("Could not find a \"THE END\" line in the text", call. = FALSE)
}

#Keep only the first match so bookEnd is always a single line index
bookEnd <- bookEnd[1]

#Delete extra Project Gutenberg text after the end marker
mirth <- mirth[seq_len(bookEnd)]

#Split each line on runs of whitespace to get word tokens
#Splitting on runs (not a single space) avoids empty tokens between
#consecutive spaces
#Unlist to flatten the per-line results into one character vector
mirth <- unlist(strsplit(mirth, "[[:space:]]+"))

#Normalize the text: convert to lowercase
mirth <- tolower(mirth)

#Strip out punctuation and quotes in a single pass
#Characters removed: . ! , ' ; : " ? ( ) -
#The hyphen is last in the character class so it is taken literally
mirth <- gsub("[.!,';:\"?()-]", "", mirth)

#Drop tokens that are empty after stripping (for example a token that
#was only punctuation) so they are not counted as words
mirth <- mirth[nzchar(mirth)]

#Collect the distinct words of the text, then report the number of
#distinct words, the total number of tokens, and the share of
#distinct words as a percentage
uniWords <- unique(mirth)
nUnique <- length(uniWords)
nTokens <- length(mirth)
nUnique
nTokens
nUnique / nTokens * 100

#Tabulate the tokens to get a count per word, most common first
wordTable <- sort(table(mirth), decreasing = TRUE)

#Look at the 300 most common words
head(wordTable, n = 300)

#Proceed through further slices of the table to see what is used by
#the author, until the word count becomes too small for significance
for (from in c(300, 700, 1100)) {
  print(wordTable[from:(from + 400)])
}

#Relative frequency of each word, as a percentage of all tokens
freq <- prop.table(wordTable) * 100
head(freq, n = 200)

#Frequencies of the top 100 words
topWords <- freq[seq_len(100)]

#Histogram of the top 100 word frequencies
hist(
  x = topWords,
  main = "Frequencies of Top 100 Words",
  xlab = "Top 100 Words"
)

#Explore common, accepted themes of the text to get started with the data
#Plot the token positions at which each thematic word occurs to
#visualize how these words cluster throughout the text
#Each word gets its own constant y value so the rows do not overlap
moneyPts <- which(mirth == "money")
money <- rep(1, length(moneyPts))

#Span the x axis over the whole text rather than only the positions of
#"money", so occurrences of the other words before the first or after
#the last "money" are not clipped
#Fix the y range so every row of points (0.8 to 1.1) is inside the plot
#With explicit limits the plot can also be drawn if "money" never occurs
plot(
  moneyPts,
  money,
  xlim = c(1, length(mirth)),
  ylim = c(0.6, 1.4),
  xlab = "House of Mirth by Word",
  ylab = "Occurrence of Word"
)

marryPts <- which(mirth == "marry")
marry <- rep(1.1, length(marryPts))
points(marryPts, marry, col = "red")

lovePts <- which(mirth == "love")
love <- rep(.9, length(lovePts))
points(lovePts, love, col = "blue")

seldenPts <- which(mirth == "selden")
selden <- rep(.8, length(seldenPts))
points(seldenPts, selden, col = "green")

#Add a legend to the plot; colors are listed in the order the words
#were drawn above
legend(
  "topright",
  legend = c("Money", "Marry", "Love", "Selden"),
  col = c("black", "red", "blue", "green"),
  pch = 1
)

#Will make multiple plots of words, nine to a page

#Find chapter breaks in the text
#To be able to plot usage per chapter
chapter <- which(mirth == "chapter")

#Account for the final chapter by finding the end of text
chapter <- append(chapter, length(mirth))

#Count how often `word` occurs in each chapter-sized slice of `tokens`
#  tokens: character vector of normalized word tokens
#  breaks: token positions of the chapter markers, followed by the
#          position of the last token
#  word:   the word to count
#  start:  token position at which the first slice begins
#Returns an integer vector with one count per slice
#Slice k runs from the end of the previous slice (or `start` for the
#first one) through breaks[k + 1], both ends inclusive
countByChapter <- function(tokens, breaks, word, start = 3) {
  #The first break is skipped; every later break closes one slice
  ends <- breaks[-1]
  if (length(ends) == 0) {
    stop("Need at least two break positions to count by chapter",
         call. = FALSE)
  }
  starts <- c(start, ends[-length(ends)])
  vapply(
    seq_along(ends),
    function(k) sum(tokens[starts[k]:ends[k]] == word),
    integer(1)
  )
}

#Plot the per-chapter usage of every word in `wordSet` on one 3 x 3 page
#  wordSet: list of c(word, color) character pairs
#  tokens:  character vector of word tokens, passed to countByChapter()
#  breaks:  chapter break positions, passed to countByChapter()
#  relFreq: vector of relative word frequencies indexed by word, shown
#           in the sub-title of each panel
#Called for its plots; returns NULL invisibly
plotWordSet <- function(wordSet, tokens, breaks, relFreq) {
  #Use layout to make 9 plots, display them
  layout(matrix(seq(1, 9), 3, 3, byrow = TRUE))
  layout.show(9)
  for (pair in wordSet) {
    #Plot the usage of the term by chapter with labels
    plot(
      countByChapter(tokens, breaks, pair[1]),
      col = pair[2],
      type = "l",
      xlab = pair[1],
      ylab = "# of Appearances",
      main = "Appearances by Chapter",
      sub = paste("rel.freq. = ", relFreq[pair[1]])
    )
  }
  invisible(NULL)
}

#First set of words, colors will be passed for plotting
words <-
  list(
    c("money", "blue"),
    c("marry", "purple"),
    c("love", "orange"),
    c("light", "pink"),
    c("dress", "green"),
    c("eye", "red"),
    c("selden", "orange3"),
    c("life", "deeppink"),
    c("poor", "brown")
  )

plotWordSet(words, mirth, chapter, freq)

#Second set of words
words2 <-
  list(
    c("want", "blue"),
    c("cant", "purple"),
    c("moment", "orange"),
    c("sense", "pink"),
    c("beauty", "green"),
    c("friends", "red"),
    c("grace", "orange3"),
    c("young", "deeppink"),
    c("old", "brown")
  )

plotWordSet(words2, mirth, chapter, freq)

#Third set of words
words3 <-
  list(
    c("smile", "blue"),
    c("night", "purple"),
    c("morning", "orange"),
    c("touch", "pink"),
    c("miss", "green"),
    c("mrs", "red"),
    c("voice", "orange3"),
    c("feeling", "deeppink"),
    c("alone", "brown")
  )

plotWordSet(words3, mirth, chapter, freq)

#Fourth set of words
words4 <-
  list(
    c("dark", "blue"),
    c("clear", "purple"),
    c("quiet", "orange"),
    c("wish", "pink"),
    c("secret", "green"),
    c("frightened", "red"),
    c("husband", "orange3"),
    c("wife", "deeppink"),
    c("pride", "brown")
  )

plotWordSet(words4, mirth, chapter, freq)

#Fifth set of words
words5 <-
  list(
    c("luxury", "blue"),
    c("clothes", "purple"),
    c("fancy", "orange"),
    c("handsome", "pink"),
    c("influence", "green"),
    c("cigarette", "red"),
    c("fashionable", "orange3"),
    c("expensive", "deeppink"),
    c("shabby", "brown")
  )

plotWordSet(words5, mirth, chapter, freq)

#Sixth set of words
words6 <-
  list(
    c("gown", "blue"),
    c("vanity", "purple"),
    c("bills", "orange"),
    c("divorce", "pink"),
    c("dressmaker", "green"),
    c("temptation", "red"),
    c("fortune", "orange3"),
    c("gold", "deeppink"),
    c("emotion", "brown")
  )

plotWordSet(words6, mirth, chapter, freq)

Screen Shot 2016-05-10 at 10.14.39 PM

Figure 1. Frequency of the top 100 words in the novel.

Screen Shot 2016-05-10 at 10.14.56 PM

Figure 2. Comparing obvious themes (money, love, marriage – used most often as marry in the text, and Selden) and where they cluster across the words in the text.

 

Screen Shot 2016-05-10 at 10.16.09 PM

Figure 3. The first set of words plotted for appearances by chapter.

 

Screen Shot 2016-05-10 at 10.16.55 PM

Figure 4. The second set of words plotted for appearances by chapter.

 

Screen Shot 2016-05-10 at 10.17.14 PM

Figure 5. The third set of words plotted for appearances by chapter.

 

Screen Shot 2016-05-10 at 10.17.54 PM

Figure 6. The fourth set of words plotted for appearances by chapter.

 

Screen Shot 2016-05-10 at 10.18.13 PM

Figure 7. The fifth set of words plotted for appearances by chapter.

 

Screen Shot 2016-05-10 at 10.18.30 PM

Figure 8. The sixth set of words plotted for appearances by chapter.