In R I've got a dataset like this one:
df <- data.frame(
ID = c(1:30),
x1 = seq(0, 1, length.out = 30),
x2 = seq(100, 3000, length.out = 30),
category = gl(3, 10, labels = c("NEGATIVE", "NEUTRAL", "POSITIVE"))
)
Now I want to add a new column with randomized boolean values, but within each category the proportion of TRUE and FALSE values should be the same (i.e. the randomizing process should generate the same number of TRUE and FALSE values; in the data frame above, 5 TRUEs and 5 FALSEs in each of the 3 categories). How can I do this?
You can sample a vector of TRUE and FALSE values without replacement, so you get a randomized and balanced logical column in your data frame.
sample(rep(c(TRUE, FALSE), each = 5), 10, replace = FALSE)
Based on Yacine Hajji's answer:
addRandomBool <- function(df, p){
  n <- ceiling(nrow(df) * p)
  df$bool <- sample(rep(c(TRUE, FALSE), times = c(n, nrow(df) - n)))
  df
}
Reduce(rbind, lapply(split(df, df$category), addRandomBool, p = 0.5))
where the parameter p determines the proportion of TRUE values.
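For example (a quick check added here, not part of the original answer), you can verify the TRUE/FALSE balance per category like this:
# sanity check (sketch): tabulate the bool values within each category
result <- Reduce(rbind, lapply(split(df, df$category), addRandomBool, p = 0.5))
table(result$category, result$bool)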
This will sample within each group from a vector of 5 TRUE and 5 FALSE values without replacement. It assumes that there are always 10 records per group.
library(dplyr)
library(tidyr)
df <- data.frame(
ID = c(1:30),
x1 = seq(0, 1, length.out = 30),
x2 = seq(100, 3000, length.out = 30),
category = gl(3, 10, labels = c("NEGATIVE", "NEUTRAL", "POSITIVE"))
)
set.seed(pi)
df %>%
  group_by(category) %>%
  nest() %>%
  mutate(data = lapply(data,
                       function(df){ # Function to sample and assign the new_col
                         df$new_col <- sample(rep(c(FALSE, TRUE),
                                                  each = 5),
                                              size = 10,
                                              replace = FALSE)
                         df
                       })) %>%
  unnest(cols = "data")
This next example is a little more generalized, but still assumes an (approximately) even distribution of TRUE and FALSE within a group. It can accommodate variable group sizes, and even groups with an odd number of records (though it will favor FALSE when a group has an odd number of records).
library(dplyr)
library(tidyr)
df <- data.frame(
ID = c(1:30),
x1 = seq(0, 1, length.out = 30),
x2 = seq(100, 3000, length.out = 30),
category = gl(3, 10, labels = c("NEGATIVE", "NEUTRAL", "POSITIVE"))
)
set.seed(pi)
df %>%
group_by(category) %>%
nest() %>%
mutate(data = lapply(data,
function(df){
df$new_col <- sample(rep(c(FALSE, TRUE),
length.out = nrow(df)),
size = nrow(df),
replace = FALSE)
df
})) %>%
unnest(cols = "data")
Maintaining Column Order
A couple of options to maintain the column order:
First, you can save the column order before you do your group_by/nest, and then use select to restore the order when you're done.
set.seed(pi)
orig_col <- names(df) # original column order
df %>%
group_by(category) %>%
nest() %>%
mutate(data = lapply(data,
function(df){
df$new_col <- sample(rep(c(FALSE, TRUE),
length.out = nrow(df)),
size = nrow(df),
replace = FALSE)
df
})) %>%
unnest(cols = "data") %>%
select_at(c(orig_col, "new_col")) # Restore the column order
Or you can use a base R solution that doesn't change the column order in the first place:
df <- split(df, df["category"])
df <- lapply(df,
function(df){
df$new_col <- sample(rep(c(FALSE, TRUE),
length.out = nrow(df)),
size = nrow(df),
replace = FALSE)
df
})
do.call("rbind", c(df, list(make.row.names = FALSE)))
There are likely a dozen other ways to do this, and probably more efficient ways that I'm not thinking of.
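For instance, one more base R variation (my own sketch, not from the answers above) shuffles a balanced logical vector within each category in place using ave(), so neither the row order nor the column order changes:
# sketch: per-category balanced TRUE/FALSE without splitting the data frame;
# ave() assigns back into an integer vector of zeros, so the logicals are
# coerced to 0/1 and converted back with as.logical() at the end
df$new_col <- as.logical(ave(integer(nrow(df)), df$category,
                             FUN = function(i) sample(rep(c(FALSE, TRUE),
                                                          length.out = length(i)))))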
Hello everyone and good night. I would like to know if it is possible to create a function to simplify the creation of a chart with echarts4r in R. I'm trying, but I get the error Error: Can't subset columns that don't exist. Does anyone know how I can fix it? The code I'm using is the following:
library(echarts4r)
graf_func <- function(dataframe, vary, varx){
grafico <- base |>
e_charts(vary) |>
e_bar(varx) |>
e_tooltip(trigger = "axis")
return(grafico)
}
df <- data.frame(
var1 = runif(10, min = 100, max = 200),
var2 = runif(10, min = 10, max = 200)
)
graf_func(dataframe = df, vary = var1, varx = var2)
Use the functions e_charts_() and e_bar_() and pass the column names as character strings.
library(echarts4r)
graf_func <- function(dataframe, vary, varx){
grafico <- dataframe |>
e_charts_(vary) |>
e_bar_(varx) |>
e_tooltip(trigger = "axis")
return(grafico)
}
df <- data.frame(
var1 = runif(10, min = 100, max = 200),
var2 = runif(10, min = 10, max = 200)
)
graf_func(dataframe = df, vary = "var1", varx = "var2")
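If you would rather keep bare column names in the call, one possible variation (a sketch, not part of the original answer; graf_func_bare is just an illustrative name) converts them to strings inside the function before handing them to the standard-evaluation variants:
library(echarts4r)

graf_func_bare <- function(dataframe, vary, varx){
  # capture the bare column names as strings, then reuse the
  # e_charts_() / e_bar_() approach from the answer above
  grafico <- dataframe |>
    e_charts_(deparse(substitute(vary))) |>
    e_bar_(deparse(substitute(varx))) |>
    e_tooltip(trigger = "axis")
  return(grafico)
}

graf_func_bare(dataframe = df, vary = var1, varx = var2)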
I would like to know how to plot a list.
Now I have a list
[[1977]] keyword1, keyword2, keyword3, ...
[[1978]] keyword2, keyword5, ...
...
[[2018]] keyword1, keyword3, ...
length(mylist) = 2018
lengths(mylist) = 0,0,0,0,0,......
dput(head(mylist)) = list(NULL, NULL, NULL, NULL, NULL, NULL)
And I would like to plot it using keywords' frequencies as the y-axis and 1977~2018 as the x-axis.
So it should have as many lines as there are keywords. Does anyone have any idea?
Try this example:
# example data
set.seed(1); myList <- list(sample(LETTERS[1:3], 10, replace = TRUE),
sample(LETTERS[1:3], 10, replace = TRUE),
sample(LETTERS[1:3], 10, replace = TRUE),
sample(LETTERS[1:3], 10, replace = TRUE),
sample(LETTERS[1:3], 10, replace = TRUE))
names(myList) <- 1977:1981
library(ggplot2)
library(dplyr)
plotDat <- stack(myList) %>%
mutate(myYears = as.numeric(as.character(ind)),
myWords = values) %>%
group_by(myYears, myWords) %>%
summarise(myCount = n())
ggplot(plotDat, aes(x = myYears, y = myCount, col = myWords)) +
geom_line()
You can probably use data.table::rbindlist() to create a long data.table, then summarise it to a frequency table and plot it with ggplot functions.
# using example data from #zx8754's answer.
library( data.table )
library( ggplot2 )
dt <- data.table::rbindlist( lapply( myList, as.data.table ), idcol = "year" )
dt <- dt[, .N, by = list(year, V1) ]
ggplot( data = dt, aes( x = year, y = N, group = V1, fill = V1 )) + geom_col( color = "black" )
Basically I have a vector names of all possible names, and a data frame df with a BIN (0/1) field and a NAME field. For every row with BIN == 0, I want to create a duplicate row, but with BIN = 1 and a different name, and add it to the bottom of df. Here's what I have to select a new name, given the current name:
sample(names[names!=name], 1)
But I'm not sure how to vectorize this and furthermore add it to df with the same data from BIN.
EDIT:
Sample data:
df = data.frame(BIN=c(1,0,1), NAME=c("alice","bob","cate"))
names = c("alice","bob","cate","dan")
I got closer with something like this:
rbind(df, df %>% filter(BIN == 0) %>%
mutate(NAME = sample(names[names!=NAME],1)))
But I get an error: In binattr(e1, e2): length(e1) not a multiple of length(e2).
Here's a simple approach. I think it's pretty straightforward; let me know if you have questions:
rename = subset(df, BIN == 0)
rename$NEW_NAME = sample(names, size = nrow(rename), replace = TRUE)
while(any(rename$NAME == rename$NEW_NAME)) {
matches = rename$NAME == rename$NEW_NAME
rename$NEW_NAME[matches] = sample(names, size = sum(matches), replace = TRUE)
}
rename$BIN = 1
rename$NAME = rename$NEW_NAME
rename$NEW_NAME = NULL
result = rbind(df, rename)
result
# BIN NAME
# 1 1 alice
# 2 0 bob
# 3 1 cate
# 21 1 alice
Here's another approach, less clear but more efficient. This is the "right" way to do it, but it requires a little bit more thought and explanation.
df$NAME = factor(df$NAME, levels = names)
rename = subset(df, BIN == 0)
n = length(names)
# we will increment each level number with a random integer from
# 1 to n - 1 (with a mod to make it cyclical)
offset = sample(1:(n - 1), size = nrow(rename), replace = TRUE)
adjusted = (as.integer(rename$NAME) + offset) %% n
# reconcile 1-indexed factor levels with 0-indexed mod operator
adjusted[adjusted == 0] = n
rename$NAME = names[adjusted]
rename$BIN = 1
result = rbind(df, rename)
(or, rewritten for dplyr)
df = mutate(df, NAME = factor(NAME, levels = names))
n = length(names)
df %>% filter(BIN == 0) %>%
mutate(
offset = sample(1:(n - 1), size = n(), replace = TRUE),
adjusted = (as.integer(NAME) + offset) %% n,
adjusted = if_else(adjusted == 0, n, adjusted),
NAME = names[adjusted],
BIN = 1
) %>%
select(-offset, -adjusted) %>%
rbind(df, .)
Since your issue is the vectorization part, I'd recommend testing answers on a sample case with more than one BIN == 0 row. I used this:
df = data.frame(BIN=c(1,0,1,0,0,0,0,0,0), NAME=rep(c("alice","bob","cate"), 3))
And, because I was curious, here's a benchmark for 10k rows with 26 names. Results first, code below:
# Unit: milliseconds
# expr min lq mean median uq max neval
# while_loop 34.070438 34.327020 37.53357 35.548047 39.922918 46.206454 10
# increment 1.397617 1.458592 1.88796 1.526512 2.123894 3.196104 10
# increment_dplyr 24.002169 24.681960 25.50568 25.374429 25.750548 28.054954 10
# map_char 346.531498 347.732905 361.82468 359.736403 374.648635 383.575265 10
The "clever" way is by far the fastest. My guess is the dplyr slowdown is because we can't do the direct replacement of only the relevant bits of adjusted, and instead have to add the overhead of if_else. That and we are actually adding columns to the data frame for adjusted and offset rather than dealing with vectors. This is enough to make it almost as slow as the while loop approach, which is still 10x faster than the map_chr which has to go one row at a time.
nn = 10000
df = data.frame(
BIN = sample(0:1, size = nn, replace = TRUE, prob = c(0.7, 0.3)),
NAME = factor(sample(letters, size = nn, replace = TRUE), levels = letters)
)
get.new.name <- function(c){
return(sample(names[names!=c],1))
}
microbenchmark::microbenchmark(
while_loop = {
rename = subset(df, BIN == 0)
rename$NEW_NAME = sample(names, size = nrow(rename), replace = TRUE)
while (any(rename$NAME == rename$NEW_NAME)) {
matches = rename$NAME == rename$NEW_NAME
rename$NEW_NAME[matches] = sample(names, size = sum(matches), replace = TRUE)
}
rename$BIN = 1
rename$NAME = rename$NEW_NAME
rename$NEW_NAME = NULL
result = rbind(df, rename)
},
increment = {
rename = subset(df, BIN == 0)
n = length(names)
# we will increment each level number with a random integer from
# 1 to n - 1 (with a mod to make it cyclical)
offset = sample(1:(n - 1), size = nrow(rename), replace = TRUE)
adjusted = (as.integer(rename$NAME) + offset) %% n
# reconcile 1-indexed factor levels with 0-indexed mod operator
adjusted[adjusted == 0] = n
rename$NAME = names[adjusted]
rename$BIN = 1
},
increment_dplyr = {
n = length(names)
df %>% filter(BIN == 0) %>%
mutate(
offset = sample(1:(n - 1), size = n(), replace = TRUE),
adjusted = (as.integer(NAME) + offset) %% n,
adjusted = if_else(adjusted == 0, n, adjusted),
NAME = names[adjusted],
BIN = 1
) %>%
select(-offset,-adjusted)
},
map_char = {
new.df <- df %>% filter(BIN == 0) %>%
mutate(NAME = map_chr(NAME, get.new.name)) %>%
mutate(BIN = 1)
},
times = 10
)
Well I didn't intend to answer my own question but I did find a simpler solution. I think it's better than using rowwise() but I don't know if it's necessarily the most efficient way.
library(tidyverse)
get.new.name <- function(c){
return(sample(names[names!=c],1))
}
new.df <- rbind(df, df %>% filter(BIN == 0) %>%
                  mutate(NAME = map_chr(NAME, get.new.name)) %>%
                  mutate(BIN = 1))
map_chr ended up being pretty important instead of just map, since the latter would return a weird list of lists.
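To illustrate the difference (a small sketch added here, not from the original post), using the get.new.name helper defined above:
library(purrr)
map(c("alice", "bob"), get.new.name)      # returns a list of length-1 character vectors
map_chr(c("alice", "bob"), get.new.name)  # returns a plain character vector, which is what mutate() needs here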
A little weird but I think this should be what you want:
library(tidyverse)
df <- data.frame(BIN=c(1,0,1), NAME=c("alice","bob","cate"), stringsAsFactors = FALSE)
names <- c("alice","bob","cate","dan")
df %>%
mutate(NAME_new = ifelse(BIN == 0, sample(names, n(), replace = TRUE), NA)) %>%
gather(name_type, NAME, NAME:NAME_new, na.rm = TRUE) %>%
mutate(BIN = ifelse(name_type == "NAME_new", 1, BIN)) %>%
select(-name_type)
Output:
BIN NAME
1 1 alice
2 0 bob
3 1 cate
4 1 alice
I would like to summarise pipeable functions (of magrittr/dplyr) into shorter function-"blocks" (to hopefully have more readable code). For example:
library(dplyr)
d <- tbl_df(data.frame(A = rep(LETTERS[2:5], each = 5),
M = rep(letters[1:2], times = 10),
X = round(rnorm(20, 10, 2), 1)))
# I want to replace this
# ----------------------
d %>%
group_by(A) %>%
summarise(X = mean(X)) -> d_test_1
# with this
# ---------
my_mean <- function(d, by_var, x) {
expr <- substitute(by_var) # group variable, seems ok
expr_2 <- substitute(expression(x = mean(x))) # calculate mean
print(deparse(expr_2))
# problem: x = mean(x) is only substituted to x = mean(X) .. 1 capital x, should be 2
expr_3 <- parse(text = paste(deparse(substitute(x)), "=mean(",
deparse(substitute(x)), ")"))
print(deparse(expr_3))
# expr_3 does not work either
d %>%
group_by(eval(expr)) %>%
#summarise(X = mean(X)) -> d # uses right group variable
summarise(eval(expr_3)) -> d # uses wrong group variable <> side-effect of "expr"?
invisible(d)
}
# this is the short version I am after
d %>%
my_mean(A, X) -> d_test_2
d_test_1
d_test_2
Thx & kind regards
In case somebody else does not know where to look either:
library(dplyr)
d <- tbl_df(data.frame(A = rep(LETTERS[2:5], each = 5),
M = rep(letters[1:2], times = 10),
X = round(rnorm(20, 10, 2), 1),
stringsAsFactors = F))
my_mean <- function(d, by_var, x) {
d %>%
group_by(!!enquo(by_var)) %>%
summarise(!!quo_name(enquo(x)) := mean(!!enquo(x)))
}
d %>%
my_mean(A, X) -> want
want
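For what it's worth, on more recent dplyr/rlang versions the same function can also be written with the curly-curly shorthand (a sketch equivalent to the enquo()/!! version above; my_mean2 is just an illustrative name, and the glue-style "{{ x }}" name on the left of := needs a reasonably current rlang):
library(dplyr)

my_mean2 <- function(d, by_var, x) {
  d %>%
    group_by({{ by_var }}) %>%              # inject the grouping column
    summarise("{{ x }}" := mean({{ x }}))   # keep the original column name
}

d %>%
  my_mean2(A, X)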