Plot a list using R

I would like to know how to plot a list. I currently have a list like this:
[[1977]] keyword1, keyword2, keyword3, ...
[[1978]] keyword2, keyword5, ...
...
[[2018]] keyword1, keywords3, ...
length(mylist) = 2018
lengths(mylist) = 0,0,0,0,0,......
dput(head(mylist)) = list(NULL, NULL, NULL, NULL, NULL, NULL)
I would like to plot it with the keywords' frequencies on the y-axis and the years 1977~2018 on the x-axis, so the plot should have as many lines as there are keywords. Does anyone have any idea?

Try this example:
# example data
set.seed(1)
myList <- list(sample(LETTERS[1:3], 10, replace = TRUE),
               sample(LETTERS[1:3], 10, replace = TRUE),
               sample(LETTERS[1:3], 10, replace = TRUE),
               sample(LETTERS[1:3], 10, replace = TRUE),
               sample(LETTERS[1:3], 10, replace = TRUE))
names(myList) <- 1977:1981
library(ggplot2)
library(dplyr)
plotDat <- stack(myList) %>%
  mutate(myYears = as.numeric(as.character(ind)),
         myWords = values) %>%
  group_by(myYears, myWords) %>%
  summarise(myCount = n())
ggplot(plotDat, aes(x = myYears, y = myCount, col = myWords)) +
  geom_line()
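Your real list appears to be indexed by position, so that mylist[[1977]] holds the 1977 keywords and slots 1 to 1976 are NULL (which is why dput(head(mylist)) shows NULLs and lengths(mylist) starts with zeros). A minimal pre-processing sketch, assuming the element positions really are the years:
# name elements by their position, then drop the empty slots
names(mylist) <- seq_along(mylist)
mylist <- mylist[lengths(mylist) > 0]
# stack(mylist) now works as in the example above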

You can probably use data.table::rbindlist() to create a long data.table, then summarise it to a frequency table and plot with ggplot functions:
# using example data from #zx8754's answer.
library( data.table )
library( ggplot2 )
dt <- data.table::rbindlist( lapply( myList, as.data.table ), idcol = "year" )
dt <- dt[, .N, by = list(year, V1) ]
ggplot( data = dt, aes( x = year, y = N, group = V1, fill = V1 )) +
  geom_col( color = "black" )
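Since the question asks for one line per keyword, a hedged variation on the same summary works too; note that rbindlist()'s idcol picks up the list names as character, so convert year first:
dt[, year := as.numeric(year)]
ggplot( data = dt, aes( x = year, y = N, colour = V1 )) +
  geom_line()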

Related

New column with random boolean values while controlling the ratio of TRUE/FALSE per category

In R I've got a dataset like this one:
df <- data.frame(
  ID = c(1:30),
  x1 = seq(0, 1, length.out = 30),
  x2 = seq(100, 3000, length.out = 30),
  category = gl(3, 10, labels = c("NEGATIVE", "NEUTRAL", "POSITIVE"))
)
Now I want to add a new column with randomized boolean values, but within each category the proportion of TRUE and FALSE values should be equal (i.e. the randomizing process should generate the same count of TRUE and FALSE values; in the data frame above, 5 TRUEs and 5 FALSEs in each of the 3 categories). How can I do this?
You can sample a vector of TRUE and FALSE values without replacement, so you get a randomized and balanced column in your data frame. Sample the logical values themselves rather than the strings "TRUE"/"FALSE", so the new column is a proper boolean:
sample(rep(c(TRUE, FALSE), each = 5), 10, replace = FALSE)
Based on Yacine Hajji's answer:
addRandomBool <- function(df, p){
  n <- ceiling(nrow(df) * p)
  df$bool <- sample(rep(c(TRUE, FALSE), times = c(n, nrow(df) - n)))
  df
}
Reduce(rbind, lapply(split(df, df$category), addRandomBool, p = 0.5))
where the parameter p determines the proportion of TRUE values.
This will sample within each group from a vector of 5 TRUE and 5 FALSE without replacement. It will assume that there are always 10 records per group.
library(dplyr)
library(tidyr)
df <- data.frame(
  ID = c(1:30),
  x1 = seq(0, 1, length.out = 30),
  x2 = seq(100, 3000, length.out = 30),
  category = gl(3, 10, labels = c("NEGATIVE", "NEUTRAL", "POSITIVE"))
)
set.seed(pi)
df %>%
  group_by(category) %>%
  nest() %>%
  mutate(data = lapply(data,
                       function(df){ # function to sample and assign new_col
                         df$new_col <- sample(rep(c(FALSE, TRUE),
                                                  each = 5),
                                              size = 10,
                                              replace = FALSE)
                         df
                       })) %>%
  unnest(cols = "data")
This next example is a little more generalized. It still assumes an (approximately) even split of TRUE and FALSE within a group, but it can accommodate variable group sizes, and even groups with an odd number of records (though it will favor FALSE when a group has an odd number of records).
library(dplyr)
library(tidyr)
df <- data.frame(
  ID = c(1:30),
  x1 = seq(0, 1, length.out = 30),
  x2 = seq(100, 3000, length.out = 30),
  category = gl(3, 10, labels = c("NEGATIVE", "NEUTRAL", "POSITIVE"))
)
set.seed(pi)
df %>%
  group_by(category) %>%
  nest() %>%
  mutate(data = lapply(data,
                       function(df){
                         df$new_col <- sample(rep(c(FALSE, TRUE),
                                                  length.out = nrow(df)),
                                              size = nrow(df),
                                              replace = FALSE)
                         df
                       })) %>%
  unnest(cols = "data")
Maintaining Column Order
A couple of options to maintain the column order:
First, you can save the column order before the group_by/nest, and then use select to restore the order when you're done.
set.seed(pi)
orig_col <- names(df) # original column order
df %>%
  group_by(category) %>%
  nest() %>%
  mutate(data = lapply(data,
                       function(df){
                         df$new_col <- sample(rep(c(FALSE, TRUE),
                                                  length.out = nrow(df)),
                                              size = nrow(df),
                                              replace = FALSE)
                         df
                       })) %>%
  unnest(cols = "data") %>%
  select_at(c(orig_col, "new_col")) # restore the column order
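(Note: select_at() still works but is superseded in recent dplyr; select(all_of(c(orig_col, "new_col"))) is the modern equivalent.)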
Or you can use a base R solution that doesn't change the column order in the first place
df <- split(df, df["category"])
df <- lapply(df,
             function(df){
               df$new_col <- sample(rep(c(FALSE, TRUE),
                                        length.out = nrow(df)),
                                    size = nrow(df),
                                    replace = FALSE)
               df
             })
do.call("rbind", c(df, list(make.row.names = FALSE)))
There are likely a dozen other ways to do this, and probably more efficient ways that I'm not thinking of.
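For instance, here is a compact base R sketch of the same balanced-within-group idea using ave(), starting again from the original df (my own addition, not from the answers above):
# ave() applies FUN group-wise and returns the results in the original row order
df$new_col <- as.logical(
  ave(seq_len(nrow(df)), df$category,
      FUN = function(i) sample(rep(0:1, length.out = length(i)))))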

Convert dplyr::select structure to base R

I have this data:
df_1 <- data.frame(
  x = replicate(
    n = 10, expr = runif(n = 1000, min = 20, max = 100)
  )
)
My code:
library(dplyr)
df_1 |>
  (\(x) cbind(x, r = apply(x[colnames(x = select(x, where(is.numeric) & head(x = everything(x), 2) & starts_with("x.")))], 1, sum, na.rm = T)))()
I tried to use [ instead of colnames, but it doesn't work. I want to convert this part (keeping the same selection logic as the dplyr::select structure above):
[colnames(x = select(x, where(is.numeric) & head(x = everything(x), 2) & starts_with("x.")))]
to base R.
Just do:
transform(df_1, r = x.1 + x.2)
or even:
cbind(df_1, r = rowSums(df_1[1:2]))
or even:
cbind(df_1, r = df_1[1] + df_1[2])
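If you want the base R version to keep the general selection logic rather than hard-coding x.1 and x.2, here is a sketch (my own translation of the select() call, assuming the intent is "the first two numeric columns whose names start with x."):
keep <- sapply(df_1, is.numeric) &      # where(is.numeric)
  seq_along(df_1) <= 2 &                # head(everything(), 2)
  startsWith(names(df_1), "x.")         # starts_with("x.")
cbind(df_1, r = rowSums(df_1[keep], na.rm = TRUE))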

Chi-square over multiple groups and variables

I have a huge dataset with several groups (factors with between 2 and 6 levels), and dichotomous variables (0, 1).
example data
DF <- data.frame(
  group1 = sample(x = c("A","B","C","D"), size = 100, replace = T),
  group2 = sample(x = c("red","blue","green"), size = 100, replace = T),
  group3 = sample(x = c("tiny","small","big","huge"), size = 100, replace = T),
  var1 = sample(x = 0:1, size = 100, replace = T),
  var2 = sample(x = 0:1, size = 100, replace = T),
  var3 = sample(x = 0:1, size = 100, replace = T),
  var4 = sample(x = 0:1, size = 100, replace = T),
  var5 = sample(x = 0:1, size = 100, replace = T))
I want to run a chi-square test for every group, across all the variables.
library(tidyverse)
library(rstatix)
chisq_test(DF$group1, DF$var1)
chisq_test(DF$group1, DF$var2)
chisq_test(DF$group1, DF$var3)
...
etc
I managed to make it work by using two nested for loops, but I'm sure there is a better solution
groups <- c("group1","group2","group3")
vars <- c("var1","var2","var3","var4","var5")
results <- data.frame()
for(i in groups){
  for(j in vars){
    test <- chisq_test(DF[,i], DF[,j])
    test <- mutate(test, group = i, var = j)
    results <- rbind(results, test)
  }
}
results
I think I need some kind of apply function, but I can't figure it out.
Here is one way to do it with apply. I am sure there is an even more elegant way to do it with dplyr. (Note that here I extract the p.value of the test, but you can extract something else or the whole test result if you prefer).
res <- apply(DF[,1:3], 2, function(x) {
  apply(DF[,4:8], 2,   # columns 4:8 hold var1..var5
        function(y) { chisq.test(x, y)$p.value })
})
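res is then a 5 x 3 matrix of p-values, with the variables as rows and the groups as columns.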
Here's a quick and easy dplyr solution that transforms the data into long format keyed by group and var, then runs the chi-squared test on each combination of group and var.
DF %>%
  pivot_longer(starts_with("group"), names_to = "group", values_to = "group_val") %>%
  pivot_longer(starts_with("var"), names_to = "var", values_to = "var_val") %>%
  group_by(group, var) %>%
  summarise(chisq_test(group_val, var_val)) %>%
  ungroup()
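If you'd rather keep the loop's tidy output but avoid the explicit loops, a hedged base R alternative is to build the grid of group/var combinations and Map over it:
combos <- expand.grid(group = groups, var = vars, stringsAsFactors = FALSE)
results <- do.call(rbind, Map(function(g, v) {
  mutate(chisq_test(DF[[g]], DF[[v]]), group = g, var = v)
}, combos$group, combos$var))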

How to add values from one data frame to another based on a distance criterion

I have two different datasets:
df1 <- data.frame(
  x = c(1.25:10.25),
  y = c(1.25:10.25),
  val = sample(50:150, 100, replace = FALSE)
)
df2 <- data.frame(
  x = c(1:10),
  y = c(1:10),
  val_2 = sample(50:150, 100, replace = FALSE)
)
library(ggplot2)
ggplot(df1, aes(x = x, y = y)) +
  geom_tile(aes(fill = val)) + coord_equal() +
  scale_fill_gradient(low = "yellow", high = "red") +
  geom_point(data = df2, aes(x = x, y = y, size = val_2), shape = 21, colour = "purple")
The resulting plot shows the df1 tiles with the df2 bubbles overlaid.
I would like to assign the values from df1 to df2 based on the box in which each df2 bubble lies. The result I am looking for is a copy of df2 with an added column of df1 values. So something like
df2$val_1 <-
where the right-hand side code might use some distance criterion.
Considering the sample data presented and the example to reproduce, the solution can be given by:
require(dplyr)
df2$val_1 <- left_join(df2,
                       df1 %>% mutate(x = round(x, 0), y = round(y, 0)),
                       by = c("x" = "x", "y" = "y")) %>%
  pull(val)
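This works because rounding df1's cell centres (1.25, 2.25, ...) lands exactly on df2's integer coordinates; if the two grids did not align this neatly, you would need a genuine nearest-neighbour match, as below.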
Instead, if you want a more generalizable approach based on distance, I would suggest the following.
First of all, it is important to assign a primary key to both data.frames, df1 and df2:
df1 <- data.frame(
  ID = seq.int(1:100),
  x = c(1.25:10.25),
  y = c(1.25:10.25),
  val = sample(50:150, 100, replace = FALSE)
)
df2 <- data.frame(
  ID = seq.int(1:100),
  x = c(1:10),
  y = c(1:10),
  val_2 = sample(50:150, 100, replace = FALSE)
)
We need to install the pdist package because it allows computing a distance matrix; this solution uses Euclidean distance over the variables x and y:
require(pdist)
dists <- pdist(df2[c("x", "y")],
               df1[c("x", "y")])
Let's convert the output of the pdist() function to a matrix:
dists <- as.matrix(dists)
Now, based on the resulting matrix, we want a data.frame that, for each element of df2, gives us the ID of the nearest element of df1:
assign_value <- data.frame(ID_df2 = df2$ID,
                           ID_df1 = apply(dists, 1, which.min))
We need to integrate the resulting 2-column data.frame with the val feature of df1:
assign_value <- left_join(assign_value,
                          df1[c("ID", "val")],
                          by = c("ID_df1" = "ID"))
Finally, we have obtained a data.frame with the following structure: each row refers to a unique element of df2 and is linked to the ID of the nearest element in df1 and that element's val:
  ID_df2 ID_df1 val
1      1      1  70
2      2      2 132
To obtain the final data.frame we just have to perform a simple left_join using the desired features.
alternative_solution <- dplyr::left_join(df2,
                                         assign_value[c("ID_df2", "val")],
                                         by = c("ID" = "ID_df2"))
> identical(df2$val_2, alternative_solution$val)
[1] TRUE

R formattable: apply two formats to an area

I'm trying to use the awesome formattable package to get a table with percentages and a color scale applied across multiple columns.
Here is the code:
library(formattable)
set.seed(123)
df <- data.frame(id = 1:10,
                 a = rnorm(10), b = rnorm(10), c = rnorm(10))
df$a <- percent(df$a)
df$b <- percent(df$b)
df$c <- percent(df$c)
table_with_percent_but_color_not_scaled <- formattable(df, list(a = color_tile("transparent", "pink")
                                                                , b = color_tile("transparent", "pink")
                                                                , c = color_tile("transparent", "pink")))
table_with_color_scaled_but_not_percent <- formattable(df, list(area(col = 2:4) ~ color_tile("transparent", "pink")))
The problem is that table_with_color_scaled_but_not_percent doesn't keep the percentage format, and table_with_percent_but_color_not_scaled doesn't use the same color scale across the columns.
Ideally I would like to use the area functionality, since the number and names of the columns in my final df will change.
Any idea?
Thanks!
I had the same problem a while back and had to create my own formatter. Here's the formatter, with code used to create a table similar to yours. Just adjust the tags inside the style.
library(tidyverse)
library(formattable)
colorbar <- function(color = "lightgray", fun = "comma", digits = 0) {
  fun <- match.fun(fun)
  formatter("span", x ~ fun(x, digits = digits),
            style = function(y) style(
              display = "inline-block",
              direction = "rtl",
              "border-radius" = "4px",
              "padding-right" = "2px",
              "background-color" = csscolor(color),
              width = percent(proportion(as.numeric(y), na.rm = TRUE))
            )
  )
}
set.seed(123)
df <- data.frame(id = as.factor(1:10),
                 a = rnorm(10), b = rnorm(10), c = rnorm(10)) %>%
  mutate_if(is.numeric, percent)
tbl <- df %>%
  formattable(list(area(col = 2:4) ~ colorbar(color = "pink", fun = "percent", digits = 2))) %>%
  as.htmlwidget()
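The as.htmlwidget() call converts the formattable output to an htmlwidget, so the table renders in the RStudio viewer and can be embedded in Shiny or R Markdown documents.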
If you have questions, let me know!
