I have a text file of words from historical accounts, and I want to visualise the species mentioned and the frequency of the words associated with them.
So far I have tried the following code on a .txt file containing all the historical documents in one document, but I want to ask whether there is a specific CSV format I could use as input to R for a bipartite network graph:
"""library(ggraph)
library(ggplot2)
library(dplyr)
library(pdftools)
library(tm)
library(readtext)
library(tidytext)
library(igraph)
library(tidyr)
library(FactoMineR)
library(factoextra)
library(flextable)
library(GGally)
library(ggdendro)
library(network)
library(Matrix)
library(quanteda)
library(stringr)
library(quanteda.textstats)
options(stringsAsFactors = F)
options(scipen = 999)
options(max.print=1000)
# Read in text--------
wordbase <- readtext("mq_bird_stories.txt")
# List of extra words to remove---------
extrawords <- c("the", "can", "get", "Ccchants", "make", "making", "house", "torn", "tree", "man", "however", "upon", "instructs", "wife", "coming","without", "mother", "versions","variant", "version", "thus", "got","throws", "are", "has", "already", "asks", "sacra", "can", "brings", "one", "look", "sees", "tonaheiee", "wants", "later",
"dont", "even", "may", "but", "will", "turn", "sing", "swallows", "alba", "gives", "find", "other","tonaheieee", "away","day","comes","another",
"much", "first", "but", "see", "new", "back","goes", "go","songs", "returns", "take","takes","come",
"many", "less", "now", "well", "taught", "like", "puts", "slits", "sends", "tell","tells","open","mentions",
"often", "every", "said", "two", "and", "handsome", "husband", "bring", "lives","gets", "von", "den", "steinen", "handy")
# Clean the data-------
darwin <- wordbase %>%
paste0(collapse = " ") %>%
stringr::str_squish() %>%
stringr::str_remove_all("\\(") %>%
stringr::str_remove_all("\\)") %>%
stringr::str_remove_all("!") %>%
stringr::str_remove_all(",") %>%
stringr::str_remove_all(";") %>%
stringr::str_remove_all("\\?") %>%
stringr::str_split(fixed(".")) %>%
unlist() %>%
tm::removeWords(extrawords) %>%
paste0(collapse = " ")
# One method for calculating frequencies of bigrams------
# Process into a table of words
darwin_split <- darwin %>%
as_tibble() %>%
tidytext::unnest_tokens(words, value)
# Create data frame of bigrams-------
darwin_words <- darwin_split %>%
dplyr::rename(word1 = words) %>%
dplyr::mutate(word2 = c(word1[2:length(word1)], NA)) %>%
na.omit()
# Calculate frequency of bigrams-----
darwin2grams <- darwin_words %>%
dplyr::mutate(bigram = paste(word1, word2, sep = " ")) %>%
dplyr::group_by(bigram) %>%
dplyr::summarise(frequency = n()) %>%
dplyr::arrange(-frequency)
# Define stopwords
stps <- paste0("\\b", paste0(tm::stopwords(kind = "en"), collapse = "\\b|\\b"), "\\b")
# Remove stopwords from bigram table
darwin2grams_clean <- darwin2grams %>%
dplyr::filter(!str_detect(bigram, stps))
# Another method for calculating frequencies of bigrams
# Clean corpus
darwin_clean <- darwin %>%
stringr::str_to_title()
# Tokenize corpus----
darwin_tokzd <- quanteda::tokens(darwin_clean)
# Extract bigrams------
BiGrams <- darwin_tokzd %>%
quanteda::tokens_remove(stopwords("en")) %>%
quanteda::tokens_select(pattern = "^[A-Z]",
valuetype = "regex",
case_insensitive = FALSE,
padding = TRUE) %>%
quanteda.textstats::textstat_collocations(min_count = 1, tolower = FALSE)
# read in and process text
darwinsentences <- darwin %>%
stringr::str_squish() %>%
tokenizers::tokenize_sentences(.) %>%
unlist() %>%
stringr::str_remove_all("- ") %>%
stringr::str_replace_all("\\W", " ") %>%
stringr::str_squish()
# inspect data
head(darwinsentences)
darwincorpus <- Corpus(VectorSource(darwinsentences))
# clean corpus-----
darwincorpusclean <- darwincorpus %>%
tm::tm_map(removeNumbers) %>%
tm::tm_map(content_transformer(tolower)) %>%
tm::tm_map(removeWords, stopwords()) %>%
tm::tm_map(removeWords, extrawords)
# create document term matrix
darwindtm <- DocumentTermMatrix(darwincorpusclean, control=list(bounds = list(global=c(1, Inf)), weighting = weightBin))
# convert dtm into sparse matrix
darwinsdtm <- Matrix::sparseMatrix(i = darwindtm$i, j = darwindtm$j,
x = darwindtm$v,
dims = c(darwindtm$nrow, darwindtm$ncol),
dimnames = dimnames(darwindtm))
# calculate co-occurrence counts
cooccurrences <- t(darwinsdtm) %*% darwinsdtm
# convert into matrix
collocates <- as.matrix(cooccurrences)
# inspect size of matrix
ncol(collocates)
#provide some summary stats
summary(rowSums(collocates))
#visualising collocations
# load function for co-occurrence calculation
source("https://slcladal.github.io/rscripts/calculateCoocStatistics.R")
# define term
coocTerm <- "pigeon"
# calculate co-occurrence statistics
coocs <- calculateCoocStatistics(coocTerm, darwinsdtm, measure="LOGLIK")
# inspect results
coocs[1:50]
coocdf <- coocs %>%
as.data.frame() %>%
dplyr::mutate(CollStrength = coocs,
Term = names(coocs)) %>%
dplyr::filter(CollStrength > 0)
###Make graph - visualize association strengths------
ggplot(coocdf, aes(x = reorder(Term, CollStrength, mean), y = CollStrength)) +
geom_point() +
coord_flip() +
theme_bw() +
labs(y = "")
##network
# collocates_redux is not defined above; one reasonable choice is to reduce the
# full matrix to the most frequent terms so the network stays legible
top_terms <- names(sort(rowSums(collocates), decreasing = TRUE))[1:20]
collocates_redux <- collocates[top_terms, top_terms]
net = network::network(collocates_redux,
directed = FALSE,
ignore.eval = FALSE,
names.eval = "weights")
# vertex names
network.vertex.names(net) = rownames(collocates_redux)
# inspect object
net
ggnet2(net,label = TRUE,
label.size = 4,
alpha = 0.2,
size.cut = 3,
edge.alpha = 0.3) +
guides(color = FALSE, size = FALSE)
I'd suggest taking a look at the netCoin package. If you can transform your data into nodes and links data frames, then you can easily get a high quality network visualization:
#Example of links data frame
links <-
data.frame(
matrix(
c(
"Person A","Account 1", "not link",
"Person A","Account 2", "link",
"Person B","Account 2", "link",
"Person B","Account 3", "not link",
"Person B","Account 4", "link",
"Person C","Account 4", "link"
),
nrow = 6,
ncol = 3,
byrow = TRUE,
dimnames = list(NULL,
c("Source", "Target", "other_links_column"))
),
stringsAsFactors = FALSE
)
#Example of nodes data frame
nodes <-
data.frame(
matrix(
c(
"Person A","person",
"Person B","person",
"Person C","person",
"Account 1", "account",
"Account 2", "account",
"Account 3", "account",
"Account 4", "account"
),
nrow = 7,
ncol = 2,
byrow = TRUE,
dimnames = list(NULL,
c("name", "other_nodes_column"))
),
stringsAsFactors = FALSE
)
install.packages("netCoin") #may need to install the netCoin package
library(netCoin)
?netCoin #displays netCoin Help to see all the function options
graph_df <- netCoin(nodes = nodes, #Data frame of unique nodes and their attributes #Must contain name column
links = links, #Data frame of links and their attributes #Must contain Source and Target columns
cex = 1.25, #Font size
color = "other_nodes_column", #Column in node data frame to determine node color
shape = "other_nodes_column", #Column in node data frame to determine node shape
main = "This is the title of my visualization", #Visualization title
controls = 1:5, #Controls that will be shown in the visualization (maximum of 5)
dir = "folder-with-viz-output") #Output folder for the visualization #Entire folder should be exported as a zip file
plot(graph_df) #Command to display the visualization
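To adapt this to your species-and-words question: species and words become the nodes, and each species-word pair with its frequency becomes a link, so a CSV with just species, word, and frequency columns is all the input you need. Here is a minimal sketch, assuming a hypothetical table species_words with those three columns (swap in read.csv() on your own file for the toy data):
library(netCoin)
# hypothetical bipartite frequency table; replace with your own CSV, e.g.
# species_words <- read.csv("species_words.csv", stringsAsFactors = FALSE)
species_words <- data.frame(species = c("pigeon", "pigeon", "swallow"),
                            word = c("tame", "nest", "migrates"),
                            freq = c(12, 5, 8),
                            stringsAsFactors = FALSE)
# links: Source and Target columns are required; extra columns carry link attributes
links <- data.frame(Source = species_words$species,
                    Target = species_words$word,
                    frequency = species_words$freq,
                    stringsAsFactors = FALSE)
# nodes: one row per unique species or word, with a type column to drive color/shape
nodes <- data.frame(name = c(unique(species_words$species), unique(species_words$word)),
                    type = c(rep("species", length(unique(species_words$species))),
                             rep("word", length(unique(species_words$word)))),
                    stringsAsFactors = FALSE)
bipartite_net <- netCoin(nodes = nodes, links = links,
                         color = "type", shape = "type",
                         main = "Species-word bipartite network")
plot(bipartite_net)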
I'm just getting started in R and I'm trying to wrap my head around barplots for a university assignment. Specifically, I am using the General Social Survey 2018 dataset (for the codebook: https://www.thearda.com/Archive/Files/Codebooks/GSS2018_CB.asp) and I am trying to figure out if religion has any effect on the way people seek out help for mental health. I want to use reliten (self-assessment of religiousness, from strong to no religion) as the IV and tlkclrgy (which asks if a person with mental health issues should reach out to a religious leader, yes or no) as the DV. For a better visualization of the data, I want to create a side-by-side barplot with reliten on the x-axis showing how many people answered yes and no on tlkclrgy. My problem is that on the barplot I get numbers instead of categories (from strong to no religion). This is what I tried, but I keep getting NA on the x-axis:
GSS$reliten <- factor(as.character(GSS$reliten),
levels = c("No religion", "Somewhat strong",
"Not very strong", "Strong"))
GSS <- GSS18[!GSS18$tlkclrgy %in% c(0, 8, 9),]
GSS$reliten <- as_factor(GSS$reliten)
GSS$tlkclrgy <- as_factor(GSS$tlkclrgy)
ggplot(data=GSS,mapping=aes(x=reliten,fill=tlkclrgy))+
geom_bar(position="dodge")
Does anybody have any tips?
Here is complete code to download the codebook and data, table the two columns of interest and plot the frequencies.
1. Read the data
Data will be downloaded to a temporary directory, to keep my disk tidy. Use of these first two instructions is optional.
od <- getwd()
setwd("~/Temp")
These are the links to the two files that need to be read and the filenames.
cols_url <- "https://osf.io/ydxu4/download"
cols_file <- "General Social Survey, 2018.col"
data_url <- "https://osf.io/e76rv/download"
data_file <- "General Social Survey, 2018.dat"
download.file(cols_url, cols_file, mode = "wb")
download.file(data_url, data_file, mode = "wb")
Now read in the codebook and process it, extracting the column widths and column names.
cols <- readLines(cols_file)
cols <- strsplit(cols, ": ")
# the second field holds the column positions, e.g. "12" or "12-15"
widths_char <- sapply(cols, '[', 2)
i_widths <- grepl("-", widths_char)
# an entry like "12-15" parses as the subtraction 12 - 15 = -3; negating and
# adding 1 gives the field width, 4 (sapply keeps this element-wise)
f <- function(x) sapply(x, function(s) -eval(parse(text = s)) + 1L)
widths <- rep(1L, length(widths_char))
widths[i_widths] <- f(widths_char[i_widths])
# the first field holds the variable name; strip the leading token and lowercase it
col_names <- sapply(cols, '[', 1)
col_names <- trimws(sub("^.[^ ]* ", "", col_names))
col_names <- tolower(col_names)
Finally, read the fixed width text file.
df1 <- read.fwf(data_file, widths = widths, header = FALSE, na.strings = "-", col.names = col_names)
2. Table the data
Find where the two columns we want are, using grep.
i_cols <- c(
grep("reliten", col_names, ignore.case = TRUE),
grep("tlkclrgy", col_names, ignore.case = TRUE)
)
head(df1[i_cols])
Table those columns and coerce the result to a data.frame, then coerce the columns to factor.
Here there is a problem: there is no answer 3 for tlkclrgy in the published codebook, but there are answers of 3 in the data file, so I have created an extra factor level for it.
GSS <- as.data.frame(table(df1[i_cols]))
labels_reliten <- c(
"Not applicable",
"Strong",
"Not very strong",
"Somewhat Strong",
"No religion",
"Don't know",
"No answer"
)
levels_reliten <- c(0, 1, 2, 3, 4, 8, 9)
labels_tlkclrgy <- c(
"Not applicable",
"Yes",
"No",
"Not in codebook",
"Don't know",
"No answer"
)
levels_tlkclrgy <- c(0, 1, 2, 3, 8, 9)
GSS$reliten <- factor(
GSS$reliten,
labels = labels_reliten,
levels = levels_reliten
)
GSS$tlkclrgy <- factor(
GSS$tlkclrgy,
labels = labels_tlkclrgy,
levels = levels_tlkclrgy
)
3. Plot the frequencies table
library(ggplot2)
ggplot(data = GSS, mapping = aes(x = reliten, y = Freq, fill = tlkclrgy)) +
geom_col(position = "dodge")
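If, as in your own attempt, you want to drop the non-substantive answer categories before plotting (similar to your c(0, 8, 9) filter), you can subset the frequency table first. A short sketch using the labels defined above:
GSS_sub <- subset(GSS, !reliten %in% c("Not applicable", "Don't know", "No answer") &
tlkclrgy %in% c("Yes", "No"))
ggplot(data = GSS_sub, mapping = aes(x = reliten, y = Freq, fill = tlkclrgy)) +
geom_col(position = "dodge")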
I was wondering if it is possible to use subset() with geom_freqpoly()?
I am running a topic model, and in order to report the facets properly I want to remove 4 out of the 10 facets.
My code is as follows:
ggplot(data = dat,
aes(x = date,
fill = Topics)) +
geom_freqpoly(binwidth = 3) +
labs(x = "",
y = "Topic Count",
title = "Mentions of Topic On a Monthly Basis")+
scale_x_date(date_breaks = "months", date_labels="%b")+
theme(text = element_text(size=8)) +
theme(axis.text.x = element_text(angle = 45))+
facet_wrap(Topics ~ ., scales = "free")
ggplot(subset(dat, Topics %in% c(3, 4, 5, 7, 8, 9)),
aes(x = date,
fill = topic)) +
geom_freqpoly(binwidth = 3) +
labs(x = "",
y = "Topic Count",
title = "Mentions of Topic On a Monthly Basis")+
scale_x_date(date_breaks = "months", date_labels="%b")+
theme(text = element_text(size=9)) +
theme(axis.text.x = element_text(angle = 45))+
facet_wrap(Topics ~ ., scales = "free")
However, when I try to subset the data, I get an error that says:
Fejl (Error): Faceting variables must have at least one value
Does anybody know what the issue is?
I hope this makes sense.
The full code is down below.
# packages used below
library(quanteda)
library(stringr)
library(topicmodels)
library(ggplot2)
article.data <- article.data[!is.na(article.data$fulltext), ]
## Get date
article.data$date <- as.Date(article.data$date, "%Y-%m-%d")
#all of 2018
dat <- article.data[article.data$date > as.Date("2018-01-01", "%Y-%m-%d") &
article.data$date < as.Date("2018-12-01", "%Y-%m-%d"), ]
## 'tokenize' fulltext
quanteda_options("language_stemmer" = "danish")
texts <- gsub(":", " ", dat$fulltext, fixed = T)
texts <- tokens(texts, what = "word",
remove_numbers = T,
remove_punct = T,
remove_symbols = T,
remove_separators = T,
remove_hyphens = T,
remove_url = T,
verbose = T)
texts <- tokens_tolower(texts)
texts <- tokens_remove(texts, stopwords("danish"))
texts <- tokens_wordstem(texts)
texts <- tokens_remove(texts, stopwords("danish"))
# get actual dfm from tokens
txt.mat <- dfm(texts)
#remove frequent words with no substance
txt.mat <- txt.mat %>% dfm_remove(c("ad",
"af","aldrig","alene","alle",
"allerede","alligevel","alt",
"altid","anden","andet","andre",
"at","bag","bare", "bedre", "begge","bl.a.",
"blandt", "blev", "blevet", "blive","bliver",
"burde", "bør","ca.", "com", "da",
"dag", "dansk", "danske", "de",
"dem", "den", "denne","dens",
"der","derefter","deres","derfor",
"derfra","deri","dermed","derpå",
"derved","det","dette","dig",
"din","dine","disse","dit",
"dog","du","efter","egen",
"ej","eller","ellers","en",
"end","endnu","ene","eneste","enhver","ens",
"enten","er","et","f.eks.","far","fem",
"fik","fire","flere","flest",
"fleste","for", "foran",
"fordi","forrige","fra", "fx",
"få","får","før","først",
"gennem","gjorde","gjort","god",
"godt","gør","gøre","gørende",
"ham","han","hans","har",
"havde","have","hej","hel",
"heller","helt","hen","hende",
"hendes","henover","her",
"herefter","heri","hermed",
"herpå","hos","hun","hvad",
"hvem","hver","hvilke","hvilken",
"hvilkes","hvis",
"hvor", "hvordan","hvorefter","hvorfor",
"hvorfra","hvorhen","hvori","hvorimod",
"hvornår","hvorved","i", "ifølge", "igen",
"igennem","ikke","imellem","imens",
"imod","ind","indtil","ingen",
"intet","ja","jeg","jer","jeres",
"jo","kan","kom","komme",
"kommer", "kroner", "kun","kunne","lad",
"langs", "lang", "langt", "lav","lave","lavet",
"lidt","lige","ligesom","lille",
"længere","man","mand","mange",
"med","meget","mellem","men", "mener",
"mens","mere","mest","mig",
"min","mindre","mindst","mine",
"mit","mod","må","måske",
"ned","nej","nemlig","ni",
"nogen","nogensinde","noget",
"nogle","nok","nu","ny", "nye",
"nyt","når","nær","næste",
"næsten","og","også","okay",
"om","omkring","op","os",
"otte","over","overalt","pga.", "partier",
"partiets", "partiers", "politiske",
"procent", "på", "ritzau", "samme",
"sammen","se","seks","selv","selvom",
"senere","ser","ses","siden","sig",
"sige", "siger", "sin","sine","sit",
"skal","skulle","som","stadig",
"stor","store","synes","syntes",
"syv","så","sådan","således",
"tag","tage","temmelig","thi",
"ti","tidligere","til","tilbage",
"tit","to","tre","ud","uden",
"udover","under","undtagen","var",
"ved","vi","via","vil","ville", "viser",
"vor","vore","vores","vær","være",
"været","øvrigt","facebook","http", "https",
"www","millioner", "frem", "lars", "lars_løkke",
"rasmussen", "løkke_rasmussen", "statsminister", "politik",
"formand", "partiet", "år", "tid", "and", "fler",
"sid", "regeringen", "giv", "politisk", "folketing", "mer",
"ifølg"))
############################################################
## FEATURE SELECTION
############################################################
# check out top-appearing features in dfm
topfeatures(txt.mat)
# keep features (words) appearing at least 4 times in the corpus
txt.mat <- dfm_trim(txt.mat, min_termfreq = 4)
# filter out words of fewer than three characters
txt.mat <- txt.mat[, str_length(colnames(txt.mat)) > 2]
# filter out some html trash features
#txt.mat <- txt.mat[, !grepl("[[:digit:]]+px", colnames(txt.mat))]
#txt.mat <- txt.mat[, !grepl(".", colnames(txt.mat), fixed = T)]
#txt.mat <- txt.mat[, !grepl("_", colnames(txt.mat), fixed = T)]
#txt.mat <- txt.mat[, !grepl("#", colnames(txt.mat), fixed = T)]
#txt.mat <- txt.mat[, !grepl("#", colnames(txt.mat), fixed = T)]
############################################################
## SELECT FEATURES BY TF-IDF
############################################################
# Create tf_idf-weighted dfm
ti <- dfm_tfidf(txt.mat)
# Select from main dfm using its top features
txt.mat <- dfm_keep(txt.mat, names(topfeatures(ti, n = 1000)))
############################################################
## RUN TOPIC MODEL
############################################################
# convert quanteda dfm to tm 'dtm'
dtm <- convert(txt.mat, to = "topicmodels")
# run lda with 8 topics
lda <- LDA(dtm, k = 8)
# review terms by topic
terms(lda, 10)
############################################################
## LOOK FOR 'OPTIMAL' k
############################################################
# randomly sample test data
set.seed(61218)
select <- sample(1:nrow(dtm), size = 100)
test <- dtm[select, ]
train <- dtm[!(1:nrow(dtm) %in% select), ]
n.tops <- 3:14
metrics <- data.frame(topics = n.tops,
perplexity = NA)
for(i in n.tops) { # NB: takes a while to run
print(i)
est <- LDA(train, k = i)
metrics[metrics$topics == i, "perplexity"] <- perplexity(est, newdata = test)
}
save(metrics, file = "lda_perplexity2018.RData")
qplot(data = metrics, x = topics, y = perplexity, geom = "line",
xlab = "Number of topics",
ylab = "Perplexity on test data") + theme_bw()
# We found that 8 topics was among those with the lowest perplexity but
# also one of the choices which made the most sense
############################################################
## RERUN WITH BETTER CHOICE OF k
############################################################
# run lda with 10 topics
lda <- LDA(dtm, k = 10)
save(lda, file = "dr_ft_keep2018.RData")
# examine output
terms(lda, 20)
# put topics into original data
dat$topic <- topics(lda)
# add labels
#dat$date <- factor(dat$date,
#levels = 1:12,
#labels = c("januar","februar", "marts","april", "maj", "juni", "juli", "august", "september", "oktober", "november", "decemeber"))
dat$Topics <- factor(dat$topic,
levels = 1:10,
labels = c("Topic 1", "Topic 2", "Integration", "Taxation", "Burka Prohibition",
"Topic 6", "Justice", "Foreign Affairs", "Housing", "Topic 10"))
# frequency
qplot(data = dat, x = Topics,
geom = "bar", xlab = "",
ylab = "Topic Frequency", fill=Topics, main = "Figure 1: Main Topics in 2018 - DR") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90))
#Make visualization showing topics over time
ggplot(data = dat,
aes(x = date,
fill = Topics)) +
geom_freqpoly(binwidth = 30) +
facet_wrap(Topics ~ ., scales = "free")+
theme_classic() +
scale_x_date(breaks = as.Date(c("2018-02-01", "2018-04-01", "2018-06-01", "2018-08-01", "2018-10-01", "2018-12-01")), date_labels = "%B") +
theme(axis.text.x = element_text(angle = 90))
ggplot(data = dat,
aes(x = date,
fill = Topics)) +
geom_freqpoly(binwidth = 3) +
labs(x = "",
y = "Topic Count",
title = "Mentions of Topic On a Monthly Basis")+
scale_x_date(date_breaks = "months", date_labels="%b")+
theme(text = element_text(size=8)) +
theme(axis.text.x = element_text(angle = 45))+
facet_wrap(Topics ~ ., scales = "free")
It's best practice on this forum to make your question reproducible, so that others can try it and test their solutions to confirm they work. It's also good if you can make it minimal, both to respect potential answerers' time and to help clarify your own understanding of the problem.
How to make a great R reproducible example
In this case, the error message suggests that your subsetting is removing all your data, which breaks the faceting. It can't plot any facets if the faceting variable has no values.
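You can reproduce the same error with a built-in dataset (a minimal sketch using mtcars rather than your data), because an empty subset leaves facet_wrap() with nothing to facet on:
library(ggplot2)
# subset() keeps no rows here, so the faceting variable has no values
ggplot(subset(mtcars, cyl == 99), aes(x = mpg)) +
  geom_freqpoly(binwidth = 5) +
  facet_wrap(cyl ~ ., scales = "free")
# Error: Faceting variables must have at least one value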
It looks like dat$Topics is a factor, but your subset call refers to Topics as if it were numeric, with Topics %in% c(3, 4, 5, 7, 8, 9). For example, I can define a factor vector with the same levels as your Topics variable:
Topics <- factor(1:10, levels = 1:10,
labels = c("Topic 1", "Topic 2", "Integration", "Taxation", "Burka Prohibition",
"Topic 6", "Justice", "Foreign Affairs", "Housing", "Topic 10"))
Compare the output of these three lines:
Topics %in% c(1, 2)
# [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
as.numeric(Topics) %in% c(1, 2)
# [1] TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
Topics %in% c("Topc 1", "Topic 2")
# [1] TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
In the first case, none of the data matches the test, so using it to subset would give you an empty data set; that seems like a plausible cause of the error you got.
To refer to the Topics by their underlying level numbers, use as.numeric(Topics) %in% c(1, 2); to refer to them by their labels, use Topics %in% c("Topic 1", "Topic 2").
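Putting that into your plot call would look something like this (a sketch, keeping your six chosen facets by level number, and using fill = Topics so the fill matches the faceting variable):
ggplot(subset(dat, as.numeric(Topics) %in% c(3, 4, 5, 7, 8, 9)),
       aes(x = date, fill = Topics)) +
  geom_freqpoly(binwidth = 3) +
  labs(x = "", y = "Topic Count",
       title = "Mentions of Topic On a Monthly Basis") +
  scale_x_date(date_breaks = "months", date_labels = "%b") +
  theme(text = element_text(size = 9)) +
  theme(axis.text.x = element_text(angle = 45)) +
  facet_wrap(Topics ~ ., scales = "free")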
Since I don't have your data, I can't confirm this exact syntax will work for you, but I hope something along these lines will.
For more on how to work with factors in R, I recommend: https://r4ds.had.co.nz/factors.html