I am trying to produce multiple frequency tables that are stratified by multiple independent variables. I can get this to work for one variable and one stratification variable, but my for-loop is broken.
library(tidyverse)
# Create example dataframe of survey data
df <- data.frame(
var1 = sample(1:7, 1000, replace = TRUE),
var2 = sample(1:7, 1000, replace = TRUE),
var3 = sample(1:7, 1000, replace = TRUE),
var4 = sample(1:7, 1000, replace = TRUE),
var5 = sample(1:7, 1000, replace = TRUE),
var6 = sample(1:7, 1000, replace = TRUE),
strat1 = sample(c("A", "B", "C"), 1000, replace = TRUE),
strat2 = sample(c("X", "Y"), 1000, replace = TRUE),
strat3 = sample(c("True", "False"), 1000, replace = TRUE)
)
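(Note that the sample() calls above are unseeded, so the exact counts printed later will vary between runs; a seed in front of the data creation makes the example reproducible.)
set.seed(42) # hypothetical seed, any value works; run it before the data.frame() call above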
Here is an example that works for one variable and one stratification variable; I want to convert this code into a for loop:
temp_df <- df %>% count(var1)
temp_df$percent <- temp_df$n / sum(temp_df$n) * 100
strat_df <- temp_df %>%
  left_join((df %>% group_by(var1, strat1) %>% count(var1) %>%
               pivot_wider(names_from = strat1, values_from = n)), by = "var1")
for (k in c("A", "B", "C")) {
  strat_df[paste0(k, "_pct")] <- (strat_df[[k]] / temp_df$n) * 100
}
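For reference, the column names this produces (the counts vary because the data are random, but the names are deterministic):
names(strat_df)
# "var1" "n" "percent" "A" "B" "C" "A_pct" "B_pct" "C_pct"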
I want this same sort of output, but with added columns for count and _pct of the other two stratification variables.
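Concretely, the full column layout I'm after would be (sketched from the description; it matches the output shown further below):
# 17 columns per table: the variable itself, overall n and percent, one
# raw-count column per stratum level, and one _pct column per stratum level
c("var#", "n", "percent",
  "A", "B", "C", "X", "Y", "True", "False",
  paste0(c("A", "B", "C", "X", "Y", "True", "False"), "_pct"))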
I've tried the following for loop, but it's only giving me one row per variable, and it only produces two columns for each strat variable. The output I'm looking for would have a raw-count column and a column-percentage column for each category within each stratification variable. Since there are 3 strat vars, two having two categories and one having three, my desired output would have 17 columns, including the columns for "var#", "n", and "percent".
# Create a list of the variables of interest
variables <- c("var1", "var2", "var3", "var4", "var5", "var6")
# Create a list of the stratification variables
strats <- c("strat1", "strat2", "strat3")
# Create a loop that runs through each variable
for (i in variables) {
  # Create a frequency table for the current variable
  temp_df <- df %>% count(!!i)
  # Add a column for the percent of responses within each response category
  temp_df$percent <- temp_df$n / sum(temp_df$n) * 100
  # Add a column for the raw count for each category of the stratification variables
  for (j in strats) {
    temp_df <- temp_df %>% group_by(!!i) %>% mutate(!!j := n())
  }
  # Add a column for the percent of the stratification variable category within the response category
  for (j in strats) {
    temp_df[paste0(j, "_pct")] <- (temp_df[[j]] / temp_df$n) * 100
  }
  assign(paste0(i, "_df"), temp_df)
}
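The root cause can be seen in isolation: unquoting a bare string with !! injects the literal string rather than the column, so count() groups on a constant; converting with sym() first, as the UPDATE below does, fixes this.
i <- "var1"
df %>% count(!!i)
# identical to df %>% count("var1"): every row has the same constant value,
# so the result is a single row with n = nrow(df)
df %>% count(!!sym(i))
# the symbol version counts the actual var1 column: seven rows, one per level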
This is what I would like my output to look like (the tables printed under -output below show the desired layout).
UPDATE:
Came up with a solution that outputs what I need:
for (i in variables) {
  j <- sym(i)
  temp_df <- df %>% count(!!j)
  temp_df$percent <- temp_df$n / sum(temp_df$n) * 100
  strat_df <- temp_df %>%
    left_join((df %>% group_by(!!j, strat1) %>% count(!!j) %>%
                 pivot_wider(names_from = strat1, values_from = n)), by = i) %>%
    left_join((df %>% group_by(!!j, strat2) %>% count(!!j) %>%
                 pivot_wider(names_from = strat2, values_from = n)), by = i) %>%
    left_join((df %>% group_by(!!j, strat3) %>% count(!!j) %>%
                 pivot_wider(names_from = strat3, values_from = n)), by = i)
  for (k in c("A", "B", "C", "X", "Y", "True", "False")) {
    strat_df[paste0(k, "_pct")] <- (strat_df[[k]] / temp_df$n) * 100
  }
  assign(paste0(i, "_df"), strat_df)
}
Either convert the string to a symbol and unquote it (!!), or use across(), since the looped variables are strings:
for(i in variables){
# Create a frequency table for the current variable
temp_df <- df %>% count(across(all_of(i)))
# Add a column for the percent of responses within each response category
temp_df$percent <- temp_df$n / sum(temp_df$n) * 100
# Add a column for the raw count for each category of the stratification variables
strat_df <- temp_df %>%
left_join((df %>% group_by(across(all_of(c(i, "strat1")))) %>%
count(across(all_of(i))) %>%
pivot_wider(names_from = strat1, values_from = n)), by = i) %>%
left_join((df %>% group_by(across(all_of(c(i, "strat2")))) %>%
count(across(all_of(i))) %>%
pivot_wider(names_from = strat2, values_from = n)), by = i) %>%
left_join((df %>% group_by(across(all_of(c(i, "strat3")))) %>%
count(across(all_of(i))) %>%
pivot_wider(names_from = strat3, values_from = n)), by = i)
# Add a column for the percent of the stratification variable category within the response category
for(j in c("A","B","C","X","Y","True","False")){
strat_df[paste0(j, "_pct")] <- (strat_df[[j]] / temp_df$n) * 100
}
assign(paste0(i,"_df"), strat_df)
}
-output
> var1_df
var1 n percent A B C X Y False True A_pct B_pct C_pct X_pct Y_pct True_pct False_pct
1 1 121 12.1 36 42 43 59 62 63 58 29.75207 34.71074 35.53719 48.76033 51.23967 47.93388 52.06612
2 2 144 14.4 51 42 51 84 60 69 75 35.41667 29.16667 35.41667 58.33333 41.66667 52.08333 47.91667
3 3 147 14.7 41 39 67 60 87 73 74 27.89116 26.53061 45.57823 40.81633 59.18367 50.34014 49.65986
4 4 146 14.6 52 45 49 74 72 79 67 35.61644 30.82192 33.56164 50.68493 49.31507 45.89041 54.10959
5 5 165 16.5 51 57 57 86 79 76 89 30.90909 34.54545 34.54545 52.12121 47.87879 53.93939 46.06061
6 6 133 13.3 48 51 34 64 69 68 65 36.09023 38.34586 25.56391 48.12030 51.87970 48.87218 51.12782
7 7 144 14.4 53 44 47 67 77 73 71 36.80556 30.55556 32.63889 46.52778 53.47222 49.30556 50.69444
> var2_df
var2 n percent A B C X Y False True A_pct B_pct C_pct X_pct Y_pct True_pct False_pct
1 1 152 15.2 51 53 48 79 73 70 82 33.55263 34.86842 31.57895 51.97368 48.02632 53.94737 46.05263
2 2 147 14.7 49 46 52 73 74 55 92 33.33333 31.29252 35.37415 49.65986 50.34014 62.58503 37.41497
3 3 142 14.2 46 45 51 72 70 79 63 32.39437 31.69014 35.91549 50.70423 49.29577 44.36620 55.63380
4 4 147 14.7 50 48 49 74 73 72 75 34.01361 32.65306 33.33333 50.34014 49.65986 51.02041 48.97959
5 5 128 12.8 45 43 40 59 69 72 56 35.15625 33.59375 31.25000 46.09375 53.90625 43.75000 56.25000
6 6 152 15.2 37 52 63 74 78 83 69 24.34211 34.21053 41.44737 48.68421 51.31579 45.39474 54.60526
7 7 132 13.2 54 33 45 63 69 70 62 40.90909 25.00000 34.09091 47.72727 52.27273 46.96970 53.03030
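For what it's worth, a more compact variation on the same idea (a sketch, not from the original answers): wrap the per-variable logic in a function and collect the results in a named list instead of assign(), which avoids hand-writing one left_join() per stratification variable.
# Assumes df, variables, and strats as defined above
make_table <- function(v) {
  base <- df %>%
    count(.data[[v]]) %>%
    mutate(percent = n / sum(n) * 100)
  # join one block of per-level counts for each stratification variable
  for (s in strats) {
    wide <- df %>%
      count(.data[[v]], .data[[s]]) %>%
      pivot_wider(names_from = all_of(s), values_from = n)
    base <- left_join(base, wide, by = v)
  }
  # column percentages for every stratum-level count column
  cats <- setdiff(names(base), c(v, "n", "percent"))
  base[paste0(cats, "_pct")] <- lapply(base[cats], function(x) x / base$n * 100)
  base
}
tables <- setNames(lapply(variables, make_table), paste0(variables, "_df"))
tables$var1_df # same layout as the output above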
Related
I am working on a large data set with many columns. I am using data.table to speed up the calculations, but at certain points I am not sure how to proceed in data.table, so I convert back to a data.frame and do the calculation there, which slows the process down. It would help a lot to have suggestions on how to write the steps below in data.table. Below is a snippet of my code on dummy data:
library(data.table)
#### set the seed value
set.seed(9901)
#### create the sample variables for creating the data
p01 <- sample(1:100,1000,replace = T)
p02 <- sample(1:100,1000,replace = T)
p03 <- sample(1:100,1000,replace = T)
p04 <- sample(1:100,1000,replace = T)
p05 <- sample(1:100,1000,replace = T)
p06 <- sample(1:100,1000,replace = T)
p07 <- sample(1:100,1000,replace = T)
#### create the data.table
data <- data.table(cbind(p01,p02,p03,p04,p05,p06,p07))
###user input for last column
lcol <- 6
###calculate start column as last - 3
scol <- lcol-3
###calculate average for scol:lcol
data <- data[,avg:= apply(.SD,1,mean,na.rm=T),.SDcols=scol:lcol]
###converting to data.frame since do not know the solution in data.table
data <- as.data.frame(data)
###calculate the trend in percentage
data$t01 <- data[,lcol-00]/data[,"avg"]-1
data$t02 <- data[,lcol-01]/data[,"avg"]-1
data$t03 <- data[,lcol-02]/data[,"avg"]-1
data$t04 <- data[,lcol-03]/data[,"avg"]-1
data$t05 <- data[,lcol-04]/data[,"avg"]-1
###converting back to data.table
data <- as.data.table(data)
###calculate the min and max for the trend
data1 <- data[,`:=` (trend_min = apply(.SD,1,min,na.rm=T),
trend_max = apply(.SD,1,max,na.rm=T)),.SDcols=c(scol:lcol)]
###calculate flag if any of t04 OR t05 is an outlier for min and max values. This would be many columns in actual data
data1$flag1 <- ifelse(data1$t04 < data1$trend_min | data1$t04 > data1$trend_max,1,0)
data1$flag2 <- ifelse(data1$t05 < data1$trend_min | data1$t05 > data1$trend_max,1,0)
data1$flag <- ifelse(data1$flag1 == 1 | data1$flag2 == 1,1,0)
So basically, how can I:
1. calculate the percentages based on a user-supplied column index? Note it is not a simple division but a percentage change.
2. create the flag variable? I think I need to use the any() function, but I'm not sure how.
Some steps can be made more efficient: instead of using apply() with MARGIN = 1, the mean, min, and max can be replaced with rowMeans(), pmin(), and pmax().
library(data.table)
data[ , avg:= rowMeans(.SD, na.rm = TRUE) ,.SDcols=scol:lcol]
data[, sprintf('t%02d', 1:5) := lapply(.SD, function(x) x/avg - 1),
     .SDcols = patterns("^p0[1-5]")]
data[,`:=` (trend_min = do.call(pmin, c(.SD,na.rm=TRUE)),
trend_max = do.call(pmax, c(.SD,na.rm=TRUE)) ),.SDcols=c(scol:lcol)]
data
# p01 p02 p03 p04 p05 p06 p07 avg t01 t02 t03 t04 t05 trend_min trend_max
# 1: 35 53 22 82 100 59 69 65.75 -0.46768061 -0.19391635 -0.6653992 0.24714829 0.5209125 22 100
# 2: 78 75 15 65 70 69 66 54.75 0.42465753 0.36986301 -0.7260274 0.18721461 0.2785388 15 70
# 3: 15 45 27 61 63 75 99 56.50 -0.73451327 -0.20353982 -0.5221239 0.07964602 0.1150442 27 75
# 4: 41 80 13 22 63 84 17 45.50 -0.09890110 0.75824176 -0.7142857 -0.51648352 0.3846154 13 84
# 5: 53 9 75 47 25 75 66 55.50 -0.04504505 -0.83783784 0.3513514 -0.15315315 -0.5495495 25 75
# ---
# 996: 33 75 9 61 74 55 57 49.75 -0.33668342 0.50753769 -0.8190955 0.22613065 0.4874372 9 74
# 997: 24 68 74 11 43 75 37 50.75 -0.52709360 0.33990148 0.4581281 -0.78325123 -0.1527094 11 75
# 998: 62 78 82 97 56 50 74 71.25 -0.12982456 0.09473684 0.1508772 0.36140351 -0.2140351 50 97
# 999: 70 88 93 4 39 75 93 52.75 0.32701422 0.66824645 0.7630332 -0.92417062 -0.2606635 4 93
#1000: 20 50 99 94 62 66 98 80.25 -0.75077882 -0.37694704 0.2336449 0.17133956 -0.2274143 62 99
and then create the 'flag'
data[, flag := +(Reduce(`|`, lapply(.SD, function(x)
x < trend_min| x > trend_max))), .SDcols = t04:t05]
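For comparison, the any() route mentioned in the question can also be written row-wise; it is slower than the vectorized Reduce() above and is only a sketch:
# Row-wise any(): pass the trend columns and both bounds into .SD, test each
# row, and turn the logical into 0/1 with the unary +
data[, flag_any := apply(.SD, 1, function(r)
        +any(r[c("t04", "t05")] < r["trend_min"] |
             r[c("t04", "t05")] > r["trend_max"])),
     .SDcols = c("t04", "t05", "trend_min", "trend_max")]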
I have a big table like this (this is just an excerpt from the original table which has thousands of functions (rows) and many samples (columns, except the first one)):
function M123Q OO987 LKJY11
phi 9 2 0
3R 74 71 65
GlcNAc 1 0 1
And I need to reorder it like this, adding two extra columns ("total_hits" is the sum of all the "hits" values that share the same "ID", and "percentage" is "hits" divided by "total_hits"):
ID function hits total_hits percentage
M123Q phi 9 84 0.107142857
M123Q 3R 74 84 0.880952381
M123Q GlcNAc 1 84 0.011904762
OO987 phi 2 73 0.02739726
OO987 3R 71 73 0.97260274
OO987 GlcNAc 0 73 0
LKJY11 phi 0 66 0
LKJY11 3R 65 66 0.984848485
LKJY11 GlcNAc 1 66 0.015151515
I'm currently using R, so I'd very much appreciate an R solution if possible.
Many thanks.
Here is one way to do this where we reshape from 'wide' to 'long' (pivot_longer), grouped by 'ID', get the sum of 'hits' and the 'percentage'
library(dplyr)
library(tidyr)
df1 %>%
pivot_longer(cols = -function., names_to = "ID", values_to = "hits") %>%
arrange(ID) %>%
group_by(ID) %>%
mutate(total_hits = sum(hits), percentage = hits/total_hits)
# A tibble: 9 x 5
# Groups: ID [3]
# function. ID hits total_hits percentage
# <chr> <chr> <int> <int> <dbl>
#1 phi LKJY11 0 66 0
#2 3R LKJY11 65 66 0.985
#3 GlcNAc LKJY11 1 66 0.0152
#4 phi M123Q 9 84 0.107
#5 3R M123Q 74 84 0.881
#6 GlcNAc M123Q 1 84 0.0119
#7 phi OO987 2 73 0.0274
#8 3R OO987 71 73 0.973
#9 GlcNAc OO987 0 73 0
data
df1 <- structure(list(`function.` = c("phi", "3R", "GlcNAc"), M123Q = c(9L,
74L, 1L), OO987 = c(2L, 71L, 0L), LKJY11 = c(0L, 65L, 1L)),
class = "data.frame", row.names = c(NA,
-3L))
Base R solution:
# Reshape the dataframe long-ways:
df1 <- data.frame(reshape(df1,
idvar = "function.",
ids = unique(df1$function.),
direction = "long",
varying = names(df1)[names(df1) != "function."],
v.names = "hits",
times = names(df1)[names(df1) != "function."],
timevar = "ID"), row.names = NULL)
# Groupwise summation of hits (by ID):
df1$total_hits <- with(df1, ave(hits, ID, FUN = sum))
# Calculation of percentage:
df1$percentage <- df1$hits/df1$total_hits
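For completeness, the same reshape also fits in a short data.table pipeline (a sketch, assuming the original wide df1 from the data block above, before the reshape() call):
library(data.table)
long <- melt(as.data.table(df1), id.vars = "function.",
             variable.name = "ID", value.name = "hits")
long[, total_hits := sum(hits), by = ID]
long[, percentage := hits / total_hits]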
I am trying to select random rows from a data frame with 1000 rows (and six columns) where the skewness of the row is larger than a given value (say Sk > 0.3).
I've generated the following data frame
df=data.frame(replicate(6,sample(10:100,1000,rep=TRUE)))
I can get row skewness from the fBasics package:
rowSkewness(df) gives:
[8] -0.2243295435 0.5306809351 0.0707122386 0.0341447417 0.3339384838 -0.3910593364 -0.6443905090
[15] 0.5603809206 0.4406091534 -0.3736108832 0.0397860038 0.9970040772 -0.7702547535 0.2065830354
But now I need to select, say, 10 rows of the df which have row skewness greater than, say, 0.1. Maybe with
for (a in 1:10) {
sample.data[a,] = sample(x=df[which(rowSkewness(df[sample(1:nrow(df),1)>0.1),], size = 1, replace = TRUE)
}
or something like this?
Any thoughts on this will be appreciated.
Thanks in advance.
You can use the sample_n() or sample_frac() function - it makes your version a little shorter:
library(tidyr)
library(fBasics)
df=data.frame(replicate(6,sample(10:100,1000,rep=TRUE)))
x=df %>% dplyr::filter(rowSkewness(df)>0.1) %>% dplyr::sample_n(10)
Got it:
x <- df %>% dplyr::filter(rowSkewness(df) > 0.1)
samplesize <- 10      # was undefined in the original
sample.data <- x[0, ] # preallocate an empty frame with the right columns
for (a in 1:samplesize) {
  sample.data[a, ] <- x[sample(nrow(x), 1), ] # sample a row, not a column
}
Just do a subset:
res1 <- DF[fBasics::rowSkewness(DF) > .1, ]
head(res1)
# X1 X2 X3 X4 X5 X6
# 7 56 28 21 93 74 24
# 8 33 56 23 44 10 12
# 12 29 19 29 38 94 95
# 13 35 51 54 98 66 10
# 14 12 51 24 23 36 68
# 15 50 37 81 22 55 97
Or with e1071::skewness:
res2 <- DF[apply(as.matrix(DF), 1, e1071::skewness) > .1, ]
stopifnot(all.equal(res1, res2))
Data
set.seed(42); DF <- data.frame(replicate(6, sample(10:100, 1000, rep=TRUE)))
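And to finish the original task (10 random rows among those above the threshold), one more line on top of res1 is enough; a sketch:
set.seed(42)                   # optional, for reproducibility
res1[sample(nrow(res1), 10), ] # sample whole rows, not columns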
I am not very experienced with if statements and loops in R, so perhaps you can help me solve my problem.
My task is to add +1 to df$fz while sum(df$fz) < 450, but I have to add +1 only to the largest values, stopping as soon as sum(df$fz) reaches 450.
Here is my df
ID_PP <- c(3,6, 22, 30, 1234456)
z <- c(12325, 21698, 21725, 8378, 18979)
fz <- c(134, 67, 70, 88, 88)
df <- data.frame(ID_PP,z,fz)
After adding the new column df$new_value, it should look like: 134 68 71 88 89
At this moment I have this code, but it adds +1 to all values.
if (sum(df$fz ) < 450) {
mutate(df, new_value=fz+1)
}
I know that I can pick top_n(3, z) and add +1 only to this top, but that is not what I want, because then I would have to pick the top size manually after checking sum(df$fz).
From what I understood from @Oksana's question and comments, we can probably do it this way:
library(tidyverse)
# data
vru <- data.frame(
id = c(3, 6, 22, 30, 1234456),
z = c(12325, 21698, 21725, 8378, 18979),
fz = c(134, 67, 70, 88, 88)
)
# solution
vru %>% #
top_n(450 - sum(fz), z) %>% # subset by top z, if sum(fz) == 450 -> NULL
mutate(fz = fz + 1) %>% # increase fz by 1 for the subset
bind_rows( #
anti_join(vru, ., by = "id"), # take rows from vru which are not in subset
. # take subset with transformed fz
) %>% # bind those subsets
arrange(id) # sort rows by id
# output
id z fz
1 3 12325 134
2 6 21698 68
3 22 21725 71
4 30 8378 88
5 1234456 18979 89
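As a side note, the same top_n logic condenses to a couple of base R lines (my own sketch, assuming sum(df$fz) <= 450):
# k = how many rows still need +1 to reach 450; give it to the k largest z
k <- 450 - sum(df$fz)
df$new_value <- df$fz + (rank(-df$z, ties.method = "first") <= k)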
The clarifications in the comments helped. Let me know if this works for you. Of course, you can drop the cumsum_fz and leftover columns.
# Making variables to use in the calculation
df <- df %>%
arrange(fz) %>%
mutate(cumsum_fz = cumsum(fz),
leftover = 450 - cumsum_fz)
# Find the minimum, non-negative value to use for select values that need +1
min_pos <- min(df$leftover[df$leftover > 0])
# Creating a vector that adds 1 using the min_pos value and keeps
# the other values the same
df$new_value <- c((head(sort(df$fz), min_pos) + 1), tail(sort(df$fz), length(df$fz) - min_pos))
# Checking the sum of the new value
> sum(df$new_value)
[1] 450
>
> df
ID_PP z fz cumsum_fz leftover new_value
1 6 21698 67 67 383 68
2 22 21725 70 137 313 71
3 30 8378 88 225 225 89
4 1234456 18979 88 313 137 88
5 3 12325 134 447 3 134
EDIT:
Because utubun already posted a great tidyverse solution, I am going to translate my first one completely to base (it was a bit sloppy to mix the two anyway). Same logic as above, and using the data OP provided.
> # Using base
> df <- df[order(fz),]
>
> leftover <- 450 - cumsum(fz)
> min_pos <- min(leftover[leftover > 0])
> df$new_value <- c((head(sort(df$fz), min_pos) + 1), tail(sort(df$fz), length(df$fz) - min_pos))
>
> sum(df$new_value)
[1] 450
> df
ID_PP z fz new_value
2 6 21698 67 68
3 22 21725 70 71
4 30 8378 88 89
5 1234456 18979 88 88
1 3 12325 134 134
I have one column with 950 numbers. I want to sum rows 1:40 and place the result in a new column on row 50, then sum rows 2:41 and place that on row 51, and so on. How do I do this?
You can use the function RcppRoll::roll_sum()
Hope this helps:
r <- 49 # 49 leading NAs, so the first sum lands on row 50
df1 <- data.frame(c1 = 1:950)
v1 <- RcppRoll::roll_sum(df1$c1, n = 40)
df1$c2 <- c(rep(NA, r), v1[1:(nrow(df1) - r)])
View(df1) # in RStudio
You decide what happens with the sums that would land beyond row 950 (I've dropped them).
You can use RcppRoll::roll_sum() and dplyr::lag()...
df <- data.frame(v = 1:950)
library(dplyr)
library(RcppRoll)
range <- 40 # how many values to sum, i.e. window size
offset <- 10 # e.g. sum(1:40) goes to row 50
df <- mutate(df, roll_sum = RcppRoll::roll_sum(lag(v, n = offset),
n = range, fill = NA, align = "right"))
df[(range+offset):(range+offset+5), ]
# v roll_sum
# 50 50 820
# 51 51 860
# 52 52 900
# 53 53 940
# 54 54 980
# 55 55 1020
sum(1:range); sum(2:(range+1))
# [1] 820
# [1] 860
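If you prefer to avoid the extra package, the same windowed sum can be sketched in base R with cumsum() (assuming a 950-value column v as in the question):
v <- 1:950
cs <- c(0, cumsum(v))
win <- cs[41:951] - cs[1:911] # win[i] = sum(v[i:(i + 39)])
out <- rep(NA_real_, 950)
idx <- 50:950                 # the first sum lands on row 50
out[idx] <- win[idx - 49]     # sums that would land past row 950 are dropped
out[50]                       # 820 = sum(1:40)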