I have a list of lists I need to delete "gene-" everywhere where it happens.
I tried
lapply(net, FUN = function(x) setNames(x, sub("gene-","", x)))
but I get the error
Error in names(object) <- nm : attempt to set an attribute on NULL
head(net)
$colors
gene-AAAS gene-AAK1 gene-AAMDC gene-AAMP gene-AARS1 gene-AASDH
"magenta" "brown" "purple" "darkgrey" "brown" "blue"
gene-AASDHPPT gene-AASS gene-AATK gene-ABAT
[ reached getOption("max.print") -- omitted 8990 entries ]
$unmergedColors
gene-AAAS gene-AAK1 gene-AAMDC gene-AAMP gene-AARS1 gene-AASDH
"darkgrey" "blue" "magenta" "darkolivegreen" "blue" "brown"
gene-AASDHPPT gene-AASS gene-AATK gene-ABAT gene-ABCA1 gene-ABCA12
"lightyellow" "lightgreen" "turquoise" "darkred" "turquoise" "grey60"
[ reached getOption("max.print") -- omitted 8990 entries ]
$MEs
MEblack MEgreenyellow MElightcyan MEyellow MEturquoise MEpink MEwhite MEdarkred
M5 -0.17423916 0.141440817 0.23401244 0.36358728 -0.0220835 -0.18126013 0.05942248 -0.45035371
N3 0.47690393 0.428961135 0.07241255 -0.02557197 0.2238352 0.06742087 -0.09574663 0.52201599
$goodSamples
[1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
$goodGenes
[1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[ reached getOption("max.print") -- omitted 8990 entries ]
$dendrograms
$dendrograms[[1]]
Call:
fastcluster::hclust(d = as.dist(dissTom), method = "average")
Cluster method : average
Number of objects: 9990
dput(net)
166L, 5768L, 2346L, 7132L, 625L, 4848L, 736L, 7001L,
1721L, 6626L, 7674L, 2543L, 7013L, 8667L, 4593L, 2804L,
....
7435L, 4895L, 8462L, 1732L, 3160L, 8529L), labels = NULL,
method = "average", call = fastcluster::hclust(d = as.dist(dissTom),
method = "average"), dist.method = NULL), class = "hclust")),
TOMFiles = NULL, blockGenes = list(1:9990), blocks = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
...
1, 1), MEsOK = TRUE)
Your code almost works, you need two changes:
You want to modify the names, so your inner function needs to read names(x):
function(x) setNames(x, sub("gene-", "", names(x)))
net contains a lot of members. You only want to replace the colors and unmergedColors members, so apply your function to only those. Oh, and you need to assign the result back to your object:
which = c("colors", "unmergedColors")
net[which] = lapply(net[which], function(x) setNames(x, sub("gene-", "", names(x))))
I was wondering if it was possible to use subset on a geom_polyfreq()?
I am running a topic model and in order to report the facets properly i want to remove 4 out of 10 facets.
My code is as follows:
ggplot(data = dat,
aes(x = date,
fill = Topics)) +
geom_freqpoly(binwidth = 3) +
labs(x = "",
y = "Topic Count",
title = "Mentions of Topic On a Monthly Basis")+
scale_x_date(date_breaks = "months", date_labels="%b")+
theme(text = element_text(size=8)) +
theme(axis.text.x = element_text(angle = 45))+
facet_wrap(Topics ~ ., scales = "free")
> ggplot(subset(dat, Topics %in% c(3, 4, 5, 7, 8, 9)),
aes(x = date,
fill = topic)) +
geom_freqpoly(binwidth = 3) +
labs(x = "",
y = "Topic Count",
title = "Mentions of Topic On a Monthly Basis")+
scale_x_date(date_breaks = "months", date_labels="%b")+
theme(text = element_text(size=9)) +
theme(axis.text.x = element_text(angle = 45))+
facet_wrap(Topics ~ ., scales = "free")
However, when I try to subset the data, I get an error that says:
Fejl: Faceting variables must have at least one value
Does anybody know what the issue is?
I hope this makes sense.
The full code is down below.
article.data <- article.data[!is.na(article.data$fulltext), ]
## Get date
article.data$date <- as.Date(article.data$date, "%Y-%m-%d")
#all of 2018
dat <- article.data[article.data$date > as.Date("2018-01-01", "%Y-%m-%d") &
article.data$date < as.Date("2018-12-01", "%Y-%m-%d"), ]
## 'tokenize' fulltext
quanteda_options("language_stemmer" = "danish")
texts <- gsub(":", " ", dat$fulltext, fixed = T)
texts <- tokens(texts, what = "word",
remove_numbers = T,
remove_punct = T,
remove_symbols = T,
remove_separators = T,
remove_hyphens = T,
remove_url = T,
verbose = T)
texts <- tokens_tolower(texts)
texts <- tokens_remove(texts, stopwords("danish"))
texts <- tokens_wordstem(texts)
texts <- tokens_remove(texts, stopwords("danish"))
# get actual dfm from tokens
txt.mat <- dfm(texts)
#remove frequent words with no substance
txt.mat <- txt.mat %>% dfm_remove(c("ad",
"af","aldrig","alene","alle",
"allerede","alligevel","alt",
"altid","anden","andet","andre",
"at","bag","bare", "bedre", "begge","bl.a.",
"blandt", "blev", "blevet", "blive","bliver",
"burde", "bør","ca.", "com", "da",
"dag", "dansk", "danske", "de",
"dem", "den", "denne","dens",
"der","derefter","deres","derfor",
"derfra","deri","dermed","derpå",
"derved","det","dette","dig",
"din","dine","disse","dit",
"dog","du","efter","egen",
"ej","eller","ellers","en",
"end","endnu","ene","eneste","enhver","ens",
"enten","er","et","f.eks.","far","fem",
"fik","fire","flere","flest",
"fleste","for", "foran",
"fordi","forrige","fra", "fx",
"få","får","før","først",
"gennem","gjorde","gjort","god",
"godt","gør","gøre","gørende",
"ham","han","hans","har",
"havde","have","hej","hel",
"heller","helt","hen","hende",
"hendes","henover","her",
"herefter","heri","hermed",
"herpå","hos","hun","hvad",
"hvem","hver","hvilke","hvilken",
"hvilkes","hvis",
"hvor", "hvordan","hvorefter","hvorfor",
"hvorfra","hvorhen","hvori","hvorimod",
"hvornår","hvorved","i", "ifølge", "igen",
"igennem","ikke","imellem","imens",
"imod","ind","indtil","ingen",
"intet","ja","jeg","jer","jeres",
"jo","kan","kom","komme",
"kommer", "kroner", "kun","kunne","lad",
"langs", "lang", "langt", "lav","lave","lavet",
"lidt","lige","ligesom","lille",
"længere","man","mand","mange",
"med","meget","mellem","men", "mener",
"mens","mere","mest","mig",
"min","mindre","mindst","mine",
"mit","mod","må","måske",
"ned","nej","nemlig","ni",
"nogen","nogensinde","noget",
"nogle","nok","nu","ny", "nye",
"nyt","når","nær","næste",
"næsten","og","også","okay",
"om","omkring","op","os",
"otte","over","overalt","pga.", "partier",
"partiets", "partiers", "politiske",
"procent", "på", "ritzau", "samme",
"sammen","se","seks","selv","selvom",
"senere","ser","ses","siden","sig",
"sige", "siger", "sin","sine","sit",
"skal","skulle","som","stadig",
"stor","store","synes","syntes",
"syv","så","sådan","således",
"tag","tage","temmelig","thi",
"ti","tidligere","til","tilbage",
"tit","to","tre","ud","uden",
"udover","under","undtagen","var",
"ved","vi","via","vil","ville", "viser",
"vor","vore","vores","vær","være",
"været","øvrigt","facebook","http", "https",
"www","millioner", "frem", "lars", "lars_løkke",
"rasmussen", "løkke_rasmussen", "statsminister", "politik",
"formand", "partiet", "år", "tid", "and", "fler",
"sid", "regeringen", "giv", "politisk", "folketing", "mer",
"ifølg"))
############################################################
## FEATURE SELECTION
############################################################
# check out top-appearing features in dfm
topfeatures(txt.mat)
# keep features (words) appearing in >2 documents
txt.mat <- dfm_trim(txt.mat, min_termfreq = 4)
# filter out one-character words
txt.mat <- txt.mat[, str_length(colnames(txt.mat)) > 2]
# filter out some html trash features
#txt.mat <- txt.mat[, !grepl("[[:digit:]]+px", colnames(txt.mat))]
#txt.mat <- txt.mat[, !grepl(".", colnames(txt.mat), fixed = T)]
#txt.mat <- txt.mat[, !grepl("_", colnames(txt.mat), fixed = T)]
#txt.mat <- txt.mat[, !grepl("#", colnames(txt.mat), fixed = T)]
#txt.mat <- txt.mat[, !grepl("#", colnames(txt.mat), fixed = T)]
############################################################
## SELECT FEATURES BY TF-IDF
############################################################
# Create tf_idf-weighted dfm
ti <- dfm_tfidf(txt.mat)
# Select from main dfm using its top features
txt.mat <- dfm_keep(txt.mat, names(topfeatures(ti, n = 1000)))
############################################################
## RUN TOPIC MODEL
############################################################
# convert quanteda dfm to tm 'dtm'
dtm <- convert(txt.mat, to = "topicmodels")
# run lda with 8 topics
lda <- LDA(dtm, k = 8)
# review terms by topic
terms(lda, 10)
############################################################
## LOOK FOR 'OPTIMAL' k
############################################################
# randomly sample test data
set.seed(61218)
select <- sample(1:nrow(dtm), size = 100)
test <- dtm[select, ]
train <- dtm[!(1:nrow(dtm) %in% select), ]
n.tops <- 3:14
metrics <- data.frame(topics = n.tops,
perplexity = NA)
for(i in n.tops) { # NB: takes awhile to run
print(i)
est <- LDA(train, k = i)
metrics[(i - 1), "perplexity"] <- perplexity(est, newdata = test)
}
save(metrics, file = "lda_perplexity2018.RData")
qplot(data = metrics, x = topics, y = perplexity, geom = "line",
xlab = "Number of topics",
ylab = "Perplexity on test data") + theme_bw()
#We found that 8 topics was one of those of lowest perplexity but
#also the ones which made the most sense
############################################################
## RERUN WITH BETTER CHOICE OF k
############################################################
# run lda with 10 topics
lda <- LDA(dtm, k = 10)
save(lda, file = "dr_ft_keep2018.RData")
# examine output
terms(lda, 20)
# put topics into original data
dat$topic <- topics(lda)
# add labels
#dat$date <- factor(dat$date,
#levels = 1:12,
#labels = c("januar","februar", "marts","april", "maj", "juni", "juli", "august", "september", "oktober", "november", "decemeber"))
dat$Topics <- factor(dat$topic,
levels = 1:10,
labels = c("Topc 1", "Topic 2", "Integration", "Taxation", "Burka Prohibition",
"Topic 6", "Justice", "Foreign Affairs", "Housing", "Topic 10"))
# frequency
qplot(data = dat, x = Topics,
geom = "bar", xlab = "",
ylab = "Topic Frequency", fill=Topics, main = "Figure 1: Main Topics in 2018 - DR") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90))
#Make visualization showing topics over time
ggplot(data = dat,
aes(x = date,
fill = Topics[1])) +
geom_freqpoly(binwidth = 30) +
facet_wrap(Topics ~ ., scales = "free")+
theme_classic() +
scale_x_date(breaks = as.Date(c( "2018-02-01", "2018-04-01", "2018-06-01", "2018-08-01", "2018-10-01", "2018-12-01", date_labels="%B"))) +
theme(axis.text.x = element_text(angle = 90))
ggplot(data = dat,
aes(x = date,
fill = Topics)) +
geom_freqpoly(binwidth = 3) +
labs(x = "",
y = "Topic Count",
title = "Mentions of Topic On a Monthly Basis")+
scale_x_date(date_breaks = "months", date_labels="%b")+
theme(text = element_text(size=8)) +
theme(axis.text.x = element_text(angle = 45))+
facet_wrap(Topics ~ ., scales = "free")
It's best practice on this forum to make your question reproducible, so that others can try it and test their solutions to confirm they work. It's also good if you can make it minimal, both to respect potential answerers' time and to help clarify your own understanding of the problem.
How to make a great R reproducible example
In this case, the error message suggests that your subsetting is removing all your data, which breaks the faceting. It can't plot any facets if the faceting variable has no values.
It looks like dat$Topics is a factor, but your loop is referring to Topics like they're numeric with Topics %in% c(3, 4, 5, 7, 8, 9). For example, I could define a factor vector with the same levels as your Topics variable:
Topics <- factor(1:10, levels = 1:10,
labels = c("Topc 1", "Topic 2", "Integration", "Taxation", "Burka Prohibition",
"Topic 6", "Justice", "Foreign Affairs", "Housing", "Topic 10"))
Compare the output of these three lines:
Topics %in% c(1, 2)
# [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
as.numeric(Topics) %in% c(1, 2)
# [1] TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
Topics %in% c("Topc 1", "Topic 2")
# [1] TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
In the top case, none of the data matches the test, so using that to subset the data would give you an empty data set and seems like a plausible cause for the error you got.
To refer to the Topics by their underlying level, we can refer to as.numeric(Topics) %in% c(1, 2). If we want to refer to the Topics by their labels, I could use Topics %in% c("Topc 1", "Topic 2").
Since I don't have your data, I can't confirm this exact syntax will work for you, but I hope something along these lines will.
For more on how to work with factors in R, I recommend: https://r4ds.had.co.nz/factors.html
I have a character column with a different amount of values per row. This is just a small example:
GoodForMeal %>% head(5)
# A tibble: 5 x 1
GoodForMeal
<chr>
1 dessert': False, 'latenight': False, 'lunch': True, 'dinner': True
2 dessert': False, 'latenight': False, 'lunch': True, 'dinner': True
3 <NA>
4 dessert': False, 'latenight': False, 'lunch': True, 'dinner': True
5 dessert': False, 'latenight': False, 'lunch': True, 'dinner': True
Here is a dput() of the first row of the column:
structure(list(GoodForMeal = "dessert': False, 'latenight': False, 'lunch': True, 'dinner': True, 'breakfast': False, 'brunch': False}"), .Names = "GoodForMeal", row.names = c(NA,
-1L), class = c("tbl_df", "tbl", "data.frame"))
I want to assign the values before the colon as column names and the values after the colon as the values of the respective column.
Example:
desert latenight lunch diner
1 False False True True
2 False False True True
3 NA NA NA NA
4 False False True True
5 False False True True
I tried it with the tidyr packadge and the separate and the spread function:
separate(GoodForMeal, c("key", "value"), sep = ":", extra = "merge") %>% spread(key, value)
The problem is the r is not splitting all the values before the colon but just the first value.
So the result looks like this:
GoodForMeal %>% str()
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 4464 obs. of 2 variables:
$ dessert': chr " False, 'latenight': False, 'lunch': True, 'dinner': False, 'breakfast': False, 'brunch': False}" " False, 'latenight': False, 'lunch': True, 'dinner': True, 'breakfast': False, 'brunch': False}" " False, 'latenight': False, 'lunch': False, 'dinner': False, 'breakfast': False, 'brunch': False}" " False, 'latenight': False, 'lunch': True, 'dinner': True, 'breakfast': False, 'brunch': False}" ...
$ <NA> : chr NA NA NA NA ...
Any Idea how to split the values so that it´s looking like in the example? THX
Working with the test data you've provided, I would use mutate first to rid the column of characters such ' and :, along with the meal time keywords. This allows you to split on the comma that separates the various meal times. The following is an illustration:
df <- structure(list(GoodForMeal = "dessert': False, 'latenight': False, 'lunch': True, 'dinner': True, 'breakfast': False, 'brunch': False}"),
.Names = "GoodForMeal", row.names = c(NA, -1L),
class = c("tbl_df", "tbl", "data.frame"))
df %>%
mutate(GoodForMeal = trimws(gsub("[':]|dessert|lunch|dinner|latenight|brunch",
"",
GoodForMeal))) %>%
separate(GoodForMeal,
c("dessert", "latenight", "lunch", "dinner"),
", ",
extra="drop")
It should yield:
# A tibble: 1 x 4
# dessert latenight lunch dinner
# * <chr> <chr> <chr> <chr>
# False False True True
I hope this proves useful.
This is not an elegant solution (and long) but seems to work. I did change the data to make it more general. Hope this can be a good start.
# i made some changes in the data; remove lunch entry in the 4th element and remove dessert in the 1st
sampleData <- c("'dessert': False, 'latenight': False, 'lunch': True, 'dinner': True",
"'dessert': False, 'latenight': False, 'lunch': True, 'dinner': True",
NA,
"'dessert': False, 'latenight': False, 'dinner': True",
"'latenight': False, 'lunch': True, 'dinner': True")
# [1] "'dessert': False, 'latenight': False, 'lunch': True, 'dinner': True"
# [2] "'dessert': False, 'latenight': False, 'lunch': True, 'dinner': True"
# [3] NA
# [4] "'dessert': False, 'latenight': False, 'dinner': True"
# [5] "'latenight': False, 'lunch': True, 'dinner': True"
# not sure if this is necessary, but jsut to clean the data
sampleData <- gsub(x = sampleData, pattern = "'| ", replacement = "")
# i'm a data.table user, so i'll jsut use tstrsplit
# split the pairs within each elements first
x <- data.table::tstrsplit(sampleData, ",")
# split the header and the entry
test <- lapply(x, function(x) data.table::tstrsplit(x, ":", fixed = TRUE))
# get the headers
indexHeader <- do.call("rbind", lapply(test, function(x) x[[1]]))
# get the entries
indexValue <- do.call("rbind",
lapply(test, function(x){if(length(x) > 1){ return(x[[2]])}else{ return(x[[1]])} }))
# get unique headers
colNames <- unique(as.vector(indexHeader))
colNames <- colNames[!is.na(colNames)]
# determine the order of the entries using the header matrix
indexUse <- apply(indexHeader, 2, function(x) match(colNames, x))
# index the entry matrix using the above matching
resA <- mapply(FUN = function(x,y) x[y],
x = as.data.frame(indexValue),
y = as.data.frame(indexUse))
# convert to data frame
final <- as.data.frame(t(resA))
# rename columns
colnames(final) <- colNames
# should give something like this
final
# dessert latenight lunch dinner
# False False True True
# False False True True
# <NA> <NA> <NA> <NA>
# False False <NA> True
# <NA> False True True