## ----include = FALSE----------------------------------------------------------
# Chunk options applied to every chunk of the rendered vignette.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----data, eval=FALSE---------------------------------------------------------
# # NOTE: this chunk is not evaluated (eval=FALSE). It records how the page
# # texts were scraped from Wikipedia into a one-column data frame named
# # `wiki_pages` (column `page_notes`). The live code below uses the
# # `wiki_pages` object provided by the highlightr package instead.
#
# library(rvest)
# library(dplyr)
# library(purrr)
# library(tibble)
# library(stringr)
#
# class_of_interest <- ".mw-content-ltr" ## ids are #id-name, classes are .class-name
#
# # Finding newest 150 versions of Wikipedia's highlighter article
# editurl <- "https://en.wikipedia.org/w/index.php?title=Highlighter&action=history&offset=&limit=150"
# editclass_of_interest <- ".mw-changeslist-date"
#
# # Save the urls of the full articles
# url_list1 <- editurl %>%
#   rvest::read_html() %>%
#   rvest::html_nodes(editclass_of_interest) %>%
#   purrr::map(., list()) %>%
#   tibble::tibble(node = .) %>%
#   dplyr::mutate(
#     link = purrr::map_chr(node, rvest::html_attr, "href") %>%
#       paste0("https://en.wikipedia.org", .)
#   )
#
# # Finding oldest 150 versions of Wikipedia's highlighter article
# editurl2 <- "https://en.wikipedia.org/w/index.php?title=Highlighter&action=history&dir=prev&limit=150"
#
# # Save the urls of the full articles
# url_list2 <- editurl2 %>%
#   rvest::read_html() %>%
#   rvest::html_nodes(editclass_of_interest) %>%
#   purrr::map(., list()) %>%
#   tibble::tibble(node = .) %>%
#   dplyr::mutate(
#     link = purrr::map_chr(node, rvest::html_attr, "href") %>%
#       paste0("https://en.wikipedia.org", .)
#   )
#
# # Combine url list (newest revisions first, then the oldest revisions)
# url_list <- rbind(url_list1, url_list2)
#
# # create a data frame with the text of the documents
# # (character NA so the column has the right type before it is filled)
# wiki_pages <- data.frame(page_notes = rep(NA_character_, nrow(url_list)))
#
# # seq_len() keeps the loop from running when url_list has no rows
# for (i in seq_len(nrow(url_list))) {
#
#   # keep only the <p> children of the article body, strip bracketed
#   # markers such as "[1]", and collapse the paragraphs into one string
#   wiki_list <- url_list$link[i] %>%
#     rvest::read_html() %>%
#     rvest::html_node(class_of_interest) %>%
#     rvest::html_children() %>%
#     purrr::map(., list()) %>%
#     tibble::tibble(node = .) %>%
#     dplyr::mutate(type = purrr::map_chr(node, rvest::html_name)) %>%
#     dplyr::filter(type == "p") %>%
#     dplyr::mutate(text = purrr::map_chr(node, rvest::html_text)) %>%
#     dplyr::mutate(
#       cleantext = stringr::str_remove_all(text, "\\[.*?\\]") %>%
#         stringr::str_trim()
#     ) %>%
#     dplyr::summarise(cleantext = paste(cleantext, collapse = "\n"))
#
#   wiki_pages$page_notes[i] <- wiki_list$cleantext[1]
#
# }
#

## ----setup--------------------------------------------------------------------
library(highlightr)

# tokenize comments (every stored version of the article)
toks_comment <- token_comments(highlightr::wiki_pages)

## -----------------------------------------------------------------------------
# find most recent version of the article (first row) and save text to a
# data frame
transcript_example_rename <- data.frame(text = wiki_pages[1, ])

# tokenize most recent version of the article (as the reference)
toks_transcript <- token_transcript(transcript_example_rename)

## -----------------------------------------------------------------------------
# use fuzzy matching to calculate weighted frequency values between derivative
# and source documents
collocation_object <- collocate_comments_fuzzy(toks_transcript, toks_comment)

head(collocation_object)

## -----------------------------------------------------------------------------
# connect collocation frequencies to source document
merged_frequency <- transcript_frequency(
  transcript_example_rename,
  collocation_object
)

# create a ggplot object of the transcript
freq_plot <- collocation_plot(merged_frequency)

# add html tags to source document
page_highlight <- highlighted_text(
  freq_plot,
  labels = c("(fewest articles)", "(most articles)")
)

## -----------------------------------------------------------------------------
# separate the oldest version of the article (last row)
transcript_example_rename2 <- data.frame(text = wiki_pages[nrow(wiki_pages), ])

# tokenize the transcript
toks_transcript2 <- token_transcript(transcript_example_rename2)

# use fuzzy collocation on the source and derivative documents
collocation_object2 <- collocate_comments_fuzzy(toks_transcript2, toks_comment)

# connect collocation frequencies to source document
merged_frequency2 <- transcript_frequency(
  transcript_example_rename2,
  collocation_object2
)

# create a ggplot object of the transcript
freq_plot2 <- collocation_plot(merged_frequency2)

# add html tags to source document
page_highlight2 <- highlighted_text(
  freq_plot2,
  labels = c("(fewest articles)", "(most articles)")
)