In this series, we’ll cover different aspects of the R programming language. This final post is a very simple project I completed for my Introduction to R class that looks at word use in the text “The House of Mirth”.
My final project was a basic analysis of the words used in “The House of Mirth” and of the points at which they occur in the text. I purposefully did not use any packages for the project, as I wanted to attempt as much as I could with the skills I learned during my introductory course. As such, I have not removed any stop words or done any stemming. I have included the figures I created below for your reference. If you run the code yourself, you will want to run it in its divided chunks, as the plots will overwrite one another if you run it all at once. If you have any questions, feel free to contact me!
### Name: Cameron Cook
### Description: This code downloads a UTF-8 text file
### of a novel from Project Gutenberg. It then
### normalizes the text and looks at some words used by
### the author throughout the novel.
# ---- Download and load the novel ----

# URL of the novel
book <- "http://www.gutenberg.org/ebooks/284.txt.utf-8"

# Filename to be used for the downloaded text
dest <- "~/mirth.txt"

# Download from the URL to the filename.
# `mode` is the file-open mode (documented values: "w", "wb", "a", "ab");
# "auto" is a value of `method`, not of `mode`. "wb" writes the bytes
# unchanged and overwrites any previous download.
download.file(book, dest, mode = "wb")

# Path name to data: the file that was just downloaded
path_to_data <- dest

# Read the text as a character vector with one element per line.
# The first 40 lines are skipped (extra header information), surrounding
# whitespace is trimmed and blank lines are ignored.
# NOTE(review): the header length of 40 lines is hard-coded -- confirm it
# still matches the file being served.
mirth <- scan(
  path_to_data,
  what = character(),
  sep = "\n",
  skip = 40,
  strip.white = TRUE,
  blank.lines.skip = TRUE
)

# Locate the end of the text, denoted by a line reading exactly "THE END"
bookEnd <- which(mirth == "THE END")
if (length(bookEnd) == 0) {
  stop(
    "Could not find the line \"THE END\" in ", path_to_data,
    "; cannot trim the trailing Project Gutenberg text.",
    call. = FALSE
  )
}
# If the marker appears more than once, cut at the first occurrence
bookEnd <- bookEnd[1]

# Delete the extra Project Gutenberg text at the end
mirth <- mirth[seq_len(bookEnd)]
# ---- Tokenize and normalize ----

# Split each line on spaces to get word tokens, then flatten the resulting
# list into a single character vector of tokens.
mirth <- unlist(strsplit(mirth, " "))

# Normalize the text: convert to lowercase, then strip punctuation and
# quotes in one pass. The character class removes exactly these characters:
#   - . ! , ' ; : " ? ( )
# (the leading "-" inside [...] is a literal hyphen).
mirth <- tolower(mirth)
mirth <- gsub("[-.!,';:\"?()]", "", mirth)

# Splitting on single spaces and deleting punctuation leaves empty strings
# behind (runs of spaces, tokens that were only punctuation such as "--").
# Drop them so they are not counted as words in the statistics that follow.
mirth <- mirth[nzchar(mirth)]
# ---- Vocabulary and frequency overview ----

# Distinct words, total words, and the share of distinct words (percent)
uniWords <- unique(mirth)
nDistinct <- length(uniWords)
nTokens <- length(mirth)
nDistinct
nTokens
nDistinct / nTokens * 100

# Count every word and order the counts from most to least common
wordTable <- sort(table(mirth), decreasing = TRUE)

# Browse the ranking: the 300 most common words first, then further slices,
# until the counts become too small to be meaningful
head(wordTable, n = 300)
wordTable[seq(300, 700)]
wordTable[seq(700, 1100)]
wordTable[seq(1100, 1500)]

# Relative frequency of each word, as a percentage of all words
freq <- prop.table(wordTable) * 100
head(freq, n = 200)

# Relative frequencies of the 100 most common words
topWords <- freq[seq_len(100)]

# Histogram of those 100 relative frequencies
hist(
  topWords,
  main = "Frequencies of Top 100 Words",
  xlab = "Top 100 Words"
)
# ---- Where do the big themes cluster? ----

# Explore common, accepted themes of the text to get started with the data.
# Each word gets its own horizontal band (a fixed y value) and one point per
# occurrence, placed at the word's position in the token vector.
moneyPts <- which(mirth == "money")
marryPts <- which(mirth == "marry")
lovePts <- which(mirth == "love")
seldenPts <- which(mirth == "selden")

money <- rep(1, length(moneyPts))
marry <- rep(1.1, length(marryPts))
love <- rep(0.9, length(lovePts))
selden <- rep(0.8, length(seldenPts))

# Set the axis limits explicitly. Left to the defaults, the x-axis would span
# only the "money" occurrences, and points() would silently clip any other
# word's occurrences that fall before the first or after the last "money".
plot(
  moneyPts, money,
  xlim = c(1, length(mirth)),
  ylim = c(0.6, 1.4),
  xlab = "House of Mirth by Word",
  ylab = "Occurrence of Word"
)
points(marryPts, marry, col = "red")
points(lovePts, love, col = "blue")
points(seldenPts, selden, col = "green")

# Add a legend to the plot
legend(
  "topright",
  legend = c("Money", "Marry", "Love", "Selden"),
  col = c("black", "red", "blue", "green"),
  pch = 1
)
# ---- Word usage per chapter ----

# Chapter boundaries: the positions of the token "chapter", followed by the
# position of the last token so that the final chapter has an end point.
# NOTE(review): this matches every token equal to "chapter", so a use of the
# word in running prose would also be treated as a boundary -- confirm
# against the text.
chapter <- which(mirth == "chapter")
if (length(chapter) == 0) {
  stop("No \"chapter\" tokens found; cannot split the text by chapter.",
       call. = FALSE)
}
chapter <- append(chapter, length(mirth))

# Count how often `word` occurs between each pair of consecutive boundaries.
#   tokens: character vector of word tokens
#   bounds: increasing token positions; segment k runs from bounds[k] to
#           bounds[k + 1], both ends included
#   word:   the word to count
# Returns an integer vector with one count per segment.
count_by_chapter <- function(tokens, bounds, word) {
  vapply(
    seq_len(length(bounds) - 1L),
    function(k) sum(tokens[bounds[k]:bounds[k + 1L]] == word),
    integer(1)
  )
}

# Draw one 3x3 page of per-chapter line plots.
#   word_specs: list of c(word, colour) pairs, at most nine per page
#   tokens, bounds: as for count_by_chapter()
#   rel_freq:   named relative frequencies, shown under each panel
plot_word_grid <- function(word_specs, tokens, bounds, rel_freq) {
  # Use layout to make 9 plots, and display the grid
  layout(matrix(seq(1, 9), 3, 3, byrow = TRUE))
  layout.show(9)
  for (spec in word_specs) {
    word <- spec[1]
    plot(
      count_by_chapter(tokens, bounds, word),
      col = spec[2],
      type = "l",
      xlab = word,
      ylab = "# of Appearances",
      main = "Appearances by Chapter",
      sub = paste("rel.freq. = ", rel_freq[word])
    )
  }
  invisible(NULL)
}

# First set of words; each entry is c(word, colour)
words <- list(
  c("money", "blue"), c("marry", "purple"), c("love", "orange"),
  c("light", "pink"), c("dress", "green"), c("eye", "red"),
  c("selden", "orange3"), c("life", "deeppink"), c("poor", "brown")
)
plot_word_grid(words, mirth, chapter, freq)

# Second set of words
words2 <- list(
  c("want", "blue"), c("cant", "purple"), c("moment", "orange"),
  c("sense", "pink"), c("beauty", "green"), c("friends", "red"),
  c("grace", "orange3"), c("young", "deeppink"), c("old", "brown")
)
plot_word_grid(words2, mirth, chapter, freq)

# Third set of words
words3 <- list(
  c("smile", "blue"), c("night", "purple"), c("morning", "orange"),
  c("touch", "pink"), c("miss", "green"), c("mrs", "red"),
  c("voice", "orange3"), c("feeling", "deeppink"), c("alone", "brown")
)
plot_word_grid(words3, mirth, chapter, freq)

# Fourth set of words
words4 <- list(
  c("dark", "blue"), c("clear", "purple"), c("quiet", "orange"),
  c("wish", "pink"), c("secret", "green"), c("frightened", "red"),
  c("husband", "orange3"), c("wife", "deeppink"), c("pride", "brown")
)
plot_word_grid(words4, mirth, chapter, freq)

# Fifth set of words
words5 <- list(
  c("luxury", "blue"), c("clothes", "purple"), c("fancy", "orange"),
  c("handsome", "pink"), c("influence", "green"), c("cigarette", "red"),
  c("fashionable", "orange3"), c("expensive", "deeppink"),
  c("shabby", "brown")
)
plot_word_grid(words5, mirth, chapter, freq)

# Sixth set of words
words6 <- list(
  c("gown", "blue"), c("vanity", "purple"), c("bills", "orange"),
  c("divorce", "pink"), c("dressmaker", "green"), c("temptation", "red"),
  c("fortune", "orange3"), c("gold", "deeppink"), c("emotion", "brown")
)
plot_word_grid(words6, mirth, chapter, freq)

# Return to a single-panel layout so later plots are not drawn into the grid
layout(1)
Figure 1. Frequency of the top 100 words in the novel.
Figure 2. Comparing obvious themes (money, love, marriage – used most often as marry in the text, and Selden) and where they cluster across the words in the text.
Figure 3. The first set of words plotted for appearances by chapter.
Figure 4. The second set of words plotted for appearances by chapter.
Figure 5. The third set of words plotted for appearances by chapter.
Figure 6. The fourth set of words plotted for appearances by chapter.
Figure 7. The fifth set of words plotted for appearances by chapter.
Figure 8. The sixth set of words plotted for appearances by chapter.