You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
P1.1: Accessing Wikipedia “Billboard Hot 100 Era” top singles by year
Read the Wikipedia page that lists, by year, the “Billboard Hot 100 Era”
top singles
# Read the Wikipedia index page for the "Billboard Hot 100 Era" top singles.
billboard_singles <- read_html(
  "https://en.wikipedia.org/wiki/List_of_Billboard_number-one_singles"
)

# Collect the text of every link found inside a table cell on that page.
# The result is a character vector that contains the year links.
hot100_years_full <- billboard_singles %>%
  html_nodes("tbody") %>%
  html_nodes("tr") %>%
  html_nodes("td") %>%
  html_nodes("a") %>%
  html_text()

# Trim the list above to the years of the "Hot 100 Era" only (1958-2019).
# NOTE(review): the slice is positional and depends on the page layout at the
# time of scraping; 23:62 selects 40 entries, while 1958-2019 spans 62 years
# (an earlier variant used 1:62). Confirm the range against the live page.
hot100_years_all <- hot100_years_full[23:62]
hot100_years_all
P1.2: Get the table of top songs for a particular year
# Year(s) whose chart should be scraped; choose a specific year here.
hot100_years <- "2005"

# Open the index page once; every year page is reached by following the link
# whose text is that year, so the starting session is the same for each year.
billboard_url <-
  "https://en.wikipedia.org/wiki/List_of_Billboard_number-one_singles"
hot100_session <- html_session(billboard_url)

# For each requested year, follow the year link and parse the chart table.
# html_table() is applied to a node set, so each year contributes a list of
# tables; do.call(c, ...) flattens them into one overall list (NULL when
# hot100_years is empty, matching an empty c()).
hot100_list <- do.call(c, lapply(hot100_years, function(year) {
  hot100_link <- hot100_session %>%
    follow_link(year)

  hot100_link %>%
    # html_nodes("table.wikitable.plainrowheaders") %>% # 2010 and after
    html_nodes(
      xpath = "/html/body/div[3]/div[3]/div[4]/div/table[2]"
    ) %>% # Before 2010
    html_table(fill = TRUE, header = TRUE)
}))
P1.3: Get a list of unique top songs w/ artist names for a particular year
# Convert the scraped list to a data frame and keep columns 3 and 4, which
# are referenced below as `Song` and `Artist.s.`.
hot100_df <- data.frame(hot100_list)[3:4]

# Build one "<song> <artist>" search string per chart row.
hot100_df$Song.Artist <- paste(hot100_df$Song, hot100_df$Artist.s.)

# Keep each song/artist combination once (a song can appear in several rows).
hot100_songs_df <- unique(data.frame(hot100_df)[3])

# Remove all punctuation from the search strings. The data frame has a single
# column, so the cleaning is applied to that column directly.
hot100_songs_list <- str_replace_all(
  hot100_songs_df[[1]], "[:punct:]", ""
)
# hot100_songs_list <- str_replace_all(hot100_songs_df[[1]], '"', '')

# Save the song list for the selected year.
# NOTE(review): the name says 2018, but it holds whichever year was selected
# in hot100_years (2005 in the output shown below) -- consider renaming.
Songs2018 <- hot100_songs_list
Songs2018
## [1] "Let Me Love You Mario"
## [2] "Candy Shop 50 Cent featuring Olivia"
## [3] "Hollaback Girl Gwen Stefani"
## [4] "We Belong Together Mariah Carey"
## [5] "Inside Your Heaven Carrie Underwood"
## [6] "Gold Digger Kanye West featuring Jamie Foxx"
## [7] "Run It Chris Brown"
## [8] "Dont Forget About Us Mariah Carey"
P2: Genius
P2.1 Using the cleaned list of songs and artists to get lyrics for each song
# Take the cleaned song/artist strings and look each one up on genius.com,
# using RSelenium to drive a browser and read the lyrics of each song.
driver <- rsDriver(browser = c("firefox"))
remote_driver <- driver[["client"]]
remote_driver$open()

# Loop through each song name and get the lyrics for each song.
lyrics_list <- c()
lyrics_out <- c()
# hot100_songs_list <- Songs2018

# One slot per song; a slot stays NULL when the lookup for that song fails.
lyrics_by_song <- vector("list", length(hot100_songs_list))

for (i in seq_along(hot100_songs_list)) {
  # A failed lookup (no search hit, changed page layout, ...) should not throw
  # away the lyrics already collected, so it is reported and skipped.
  tryCatch(
    {
      remote_driver$navigate("https://genius.com")
      remote_driver$refresh() # Refresh to home page
      Sys.sleep(2)

      # Type the song/artist string into the search box.
      address_element <- remote_driver$findElement(
        using = "xpath",
        value = "/html/body/div/div/div[1]/form/input"
      )
      address_element$sendKeysToElement(list(hot100_songs_list[i]))
      Sys.sleep(2)

      # Submit the search.
      button_element <- remote_driver$findElement(
        using = "xpath",
        value = "/html/body/div/div/div[1]/form/div[2]"
      )
      button_element$clickElement()
      Sys.sleep(2)

      # Open the first result card.
      button_element2 <- remote_driver$findElement(
        using = "class",
        value = "mini_card"
      )
      button_element2$clickElement()
      Sys.sleep(2)

      # Locate the lyrics section of the song page and read its text.
      lyrics_out <- remote_driver$findElement(
        using = "xpath",
        value = "/html/body/routable-page/ng-outlet/song-page/div/div/div[2]/div[1]/div/defer-compile[1]/lyrics/div/div/section"
      )
      Sys.sleep(2)
      lyrics_list_text <- lyrics_out$getElementText()
      lyrics_by_song[i] <- list(lyrics_list_text)
    },
    error = function(e) {
      warning(
        "Could not fetch lyrics for '", hot100_songs_list[i], "': ",
        conditionMessage(e),
        call. = FALSE
      )
    }
  )
}

# Flatten the per-song results into one list, dropping songs that failed.
lyrics_list <- do.call(c, lyrics_by_song)
# lyrics_list <- lyrics_list[-c(1)]

# driver$server$stop() # Drops the connection to the server

# Write Lyrics to CSV or text file
# write.csv(lyrics_list, file = "test1980lyrics.csv")
# lyrics1980 <- lyrics_list
# lyrics1980 <- lyrics1980[-c(6,7)]
# lyrics1985 <- lyrics1985[-c(8,16)]
# lyrics1990 <- lyrics1990[-c(3,12)]
# lyrics2015 <- lyrics2015[-c(2)]
P3: Visualizations and Analysis
P3.1 Preparing the Lyrics for Visualizations
# Preparing the lyrics for visualizations: build a tm corpus with one
# document per scraped lyric.
# text <- read.csv(file = '/Users/jacobmannix/Desktop/test1980lyrics.csv')
docs <- Corpus(VectorSource(lyrics_list))
# inspect(docs)

# Replace separator characters with a space. Substituting a space (rather
# than an empty string) keeps the words on either side of the separator
# apart; the extra blanks are collapsed by stripWhitespace further down.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
docs <- tm_map(docs, toSpace, "/")
docs <- tm_map(docs, toSpace, "@")
docs <- tm_map(docs, toSpace, "\\|")

# Cleaning up the docs further. The order matters: stop words are removed
# before punctuation is stripped.
docs <- tm_map(docs, content_transformer(tolower)) # To lower case
docs <- tm_map(docs, removeNumbers) # Remove numbers
docs <- tm_map(docs, removeWords, stopwords("english")) # Remove stop words
docs <- tm_map(docs, removePunctuation) # Remove punctuation
docs <- tm_map(docs, stripWhitespace) # Eliminate extra white space
docs <- tm_map(docs, removeWords, c("chorus", "verse")) # Lyric-sheet labels
# docs <- tm_map(docs, removeWords, c()) # Remove your own stop words

# Create a term-document matrix and rank words by their total frequency
# across all documents.
dtm <- TermDocumentMatrix(docs)
m <- as.matrix(dtm)
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
head(d, 15)
## word freq
## baby baby 52
## aint aint 47
## shit shit 39
## let let 35
## can can 33
## girl girl 33
## know know 32
## like like 31
## gonna gonna 28
## run run 25
## love love 24
## got got 22
## ooh ooh 22
## just just 21
## wanna wanna 21