I have a big table like this (this is just an excerpt from the original table which has thousands of functions (rows) and many samples (columns, except the first one)):
function M123Q OO987 LKJY11
phi 9 2 0
3R 74 71 65
GlcNAc 1 0 1
And I need to reshape it like this, adding two extra columns ("total_hits" is the sum of all the numbers in column "hits" that share the same "ID", and "percentage" is the ratio "hits"/"total_hits"):
ID function hits total_hits percentage
M123Q phi 9 84 0.107142857
M123Q 3R 74 84 0.880952381
M123Q GlcNAc 1 84 0.011904762
OO987 phi 2 73 0.02739726
OO987 3R 71 73 0.97260274
OO987 GlcNAc 0 73 0
LKJY11 phi 0 66 0
LKJY11 3R 65 66 0.984848485
LKJY11 GlcNAc 1 66 0.015151515
I'm currently using R, so I'd very much appreciate an R solution if possible.
Many thanks.
Here is one way to do this where we reshape from 'wide' to 'long' (pivot_longer), grouped by 'ID', get the sum of 'hits' and the 'percentage'
library(dplyr)
library(tidyr)
# Reshape wide -> long (one row per function/sample pair), then, within each
# sample ID, compute the total hits and each function's share of that total.
# The result stays grouped by ID.
df1 %>%
  pivot_longer(cols = -function., names_to = "ID", values_to = "hits") %>%
  group_by(ID) %>%
  arrange(ID) %>%
  mutate(
    total_hits = sum(hits),
    percentage = hits / total_hits
  )
# A tibble: 9 x 5
# Groups: ID [3]
# function. ID hits total_hits percentage
# <chr> <chr> <int> <int> <dbl>
#1 phi LKJY11 0 66 0
#2 3R LKJY11 65 66 0.985
#3 GlcNAc LKJY11 1 66 0.0152
#4 phi M123Q 9 84 0.107
#5 3R M123Q 74 84 0.881
#6 GlcNAc M123Q 1 84 0.0119
#7 phi OO987 2 73 0.0274
#8 3R OO987 71 73 0.973
#9 GlcNAc OO987 0 73 0
data
# Example input: one row per function, one integer hit-count column per sample.
df1 <- data.frame(
  `function.` = c("phi", "3R", "GlcNAc"),
  M123Q = c(9L, 74L, 1L),
  OO987 = c(2L, 71L, 0L),
  LKJY11 = c(0L, 65L, 1L),
  stringsAsFactors = FALSE
)
Base R solution:
# Stack the sample columns into a single 'hits' column and keep the sample
# name in 'ID'; data.frame(..., row.names = NULL) resets the row names.
df1 <- local({
  sample_cols <- names(df1)[names(df1) != "function."]
  long <- reshape(
    df1,
    direction = "long",
    idvar = "function.",
    ids = unique(df1[["function."]]),
    varying = sample_cols,
    v.names = "hits",
    timevar = "ID",
    times = sample_cols
  )
  data.frame(long, row.names = NULL)
})
# Per-ID total of hits, repeated on every row belonging to that ID:
df1$total_hits <- ave(df1$hits, df1$ID, FUN = sum)
# Share of each function within its ID:
df1$percentage <- df1$hits / df1$total_hits
Related
I am trying to produce multiple frequency tables that are stratified by multiple independent variables. I can get this to work for one variable and one stratification variable, but my for-loop is broken.
library(tidyverse)
# Example survey data: six 1-7 response variables followed by three
# categorical stratification variables, 1000 random rows (no seed is set).
df <- data.frame(
  setNames(
    replicate(6, sample(1:7, 1000, replace = TRUE), simplify = FALSE),
    paste0("var", 1:6)
  ),
  strat1 = sample(c("A", "B", "C"), 1000, replace = TRUE),
  strat2 = sample(c("X", "Y"), 1000, replace = TRUE),
  strat3 = sample(c("True", "False"), 1000, replace = TRUE)
)
Example that works for one variable and one stratification variable. I want to convert this code into a for loop:
# Overall frequency table for var1.
temp_df <- df %>% count(var1)
# Share of each response category; multiply by 100 so the value is a
# percentage, on the same scale as the *_pct columns below.
temp_df$percent <- temp_df$n / sum(temp_df$n) * 100
# Append one count column per strat1 level (A, B, C).
strat_df <- temp_df %>%
  left_join(
    df %>%
      group_by(var1, strat1) %>%
      count(var1) %>%
      pivot_wider(names_from = strat1, values_from = n),
    by = "var1"
  )
# Each strat1 level's share of the row total, in percent.
for (k in c("A", "B", "C")) {
  strat_df[paste0(k, "_pct")] <- (strat_df[[k]] / temp_df$n) * 100
}
I want this same sort of output, but with added columns for count and _pct of the other two stratification variables.
I've tried using the following for loop, but it's only giving me one row per variable and it only produces two columns for each strat variable, whereas the output I'm looking for would have a raw count and column percentage column for each category within a stratification variable. Since there are 3 strat vars, two having two categories and one having three categories, my desired output would have 13 columns including the column for "v#", "n", and "percent".
# Create a list of the variables of interest
variables <- c("var1", "var2", "var3", "var4", "var5", "var6")
# Create a list of the stratification variables
strats <- c("strat1", "strat2", "strat3")
# Create a loop that runs through each variable
# (each iteration stores its result as <variable>_df via assign())
for(i in variables){
# Create a frequency table for the current variable
# NOTE(review): `i` is a character string, so `!! i` injects the string itself
# rather than the column it names; count() then groups by a constant, which
# would explain the single row per variable described above.
temp_df <- df %>% count(!! i)
# Add a column for the percent of responses within each response category
temp_df$percent <- temp_df$n / sum(temp_df$n) * 100
# Add a column for the raw count for each category of the stratification variables
# NOTE(review): n() is the size of the current group of temp_df, not a count of
# df rows per stratification level, and only one column per strat variable is
# created (named strat1/strat2/strat3) rather than one per level.
for(j in strats){
temp_df <- temp_df %>% group_by(!!i) %>% mutate( !!j := n() )
}
# Add a column for the percent of the stratification variable category within the response category
for(j in strats){
temp_df[paste0(j, "_pct")] <- (temp_df[[j]] / temp_df$n) * 100
}
assign(paste0(i,"_df"), temp_df)
}
This is what I would like my output to look like:
UPDATE:
Came up with a solution that outputs what I need:
# For each survey variable, build <variable>_df holding: the overall count and
# percent per response category, the count per level of strat1/strat2/strat3,
# and each level's share of the row total (<level>_pct).
for (i in variables) {
  # Column name as a symbol so it can be unquoted with !! inside dplyr verbs.
  j <- sym(i)
  temp_df <- df %>% count(!!j)
  # Multiply by 100 so the value is a percentage, like the *_pct columns.
  temp_df$percent <- temp_df$n / sum(temp_df$n) * 100
  strat_df <- temp_df %>%
    left_join(
      df %>%
        group_by(!!j, strat1) %>%
        count(!!j) %>%
        pivot_wider(names_from = strat1, values_from = n),
      by = i
    ) %>%
    left_join(
      df %>%
        group_by(!!j, strat2) %>%
        count(!!j) %>%
        pivot_wider(names_from = strat2, values_from = n),
      by = i
    ) %>%
    left_join(
      df %>%
        group_by(!!j, strat3) %>%
        count(!!j) %>%
        pivot_wider(names_from = strat3, values_from = n),
      by = i
    )
  for (k in c("A", "B", "C", "X", "Y", "True", "False")) {
    strat_df[paste0(k, "_pct")] <- (strat_df[[k]] / temp_df$n) * 100
  }
  assign(paste0(i, "_df"), strat_df)
}
Either convert to symbol and evaluate (!!) or use across as the variables looped are strings
# For each survey variable (given as a string), build <variable>_df with the
# overall counts/percent, one count column per stratification level, and a
# <level>_pct column giving that level's share of the row total.
for (i in variables) {
  # Frequency table and percent of responses for the current variable
  temp_df <- df %>% count(across(all_of(i)))
  temp_df$percent <- temp_df$n / sum(temp_df$n) * 100
  # Join the per-level counts of each stratification variable, in order
  strat_df <- temp_df
  for (s in c("strat1", "strat2", "strat3")) {
    wide_counts <- df %>%
      group_by(across(all_of(c(i, s)))) %>%
      count(across(all_of(i))) %>%
      pivot_wider(names_from = all_of(s), values_from = n)
    strat_df <- left_join(strat_df, wide_counts, by = i)
  }
  # Percent of each stratification level within the response category
  for (j in c("A", "B", "C", "X", "Y", "True", "False")) {
    strat_df[paste0(j, "_pct")] <- (strat_df[[j]] / temp_df$n) * 100
  }
  assign(paste0(i, "_df"), strat_df)
}
-output
> var1_df
var1 n percent A B C X Y False True A_pct B_pct C_pct X_pct Y_pct True_pct False_pct
1 1 121 12.1 36 42 43 59 62 63 58 29.75207 34.71074 35.53719 48.76033 51.23967 47.93388 52.06612
2 2 144 14.4 51 42 51 84 60 69 75 35.41667 29.16667 35.41667 58.33333 41.66667 52.08333 47.91667
3 3 147 14.7 41 39 67 60 87 73 74 27.89116 26.53061 45.57823 40.81633 59.18367 50.34014 49.65986
4 4 146 14.6 52 45 49 74 72 79 67 35.61644 30.82192 33.56164 50.68493 49.31507 45.89041 54.10959
5 5 165 16.5 51 57 57 86 79 76 89 30.90909 34.54545 34.54545 52.12121 47.87879 53.93939 46.06061
6 6 133 13.3 48 51 34 64 69 68 65 36.09023 38.34586 25.56391 48.12030 51.87970 48.87218 51.12782
7 7 144 14.4 53 44 47 67 77 73 71 36.80556 30.55556 32.63889 46.52778 53.47222 49.30556 50.69444
> var2_df
var2 n percent A B C X Y False True A_pct B_pct C_pct X_pct Y_pct True_pct False_pct
1 1 152 15.2 51 53 48 79 73 70 82 33.55263 34.86842 31.57895 51.97368 48.02632 53.94737 46.05263
2 2 147 14.7 49 46 52 73 74 55 92 33.33333 31.29252 35.37415 49.65986 50.34014 62.58503 37.41497
3 3 142 14.2 46 45 51 72 70 79 63 32.39437 31.69014 35.91549 50.70423 49.29577 44.36620 55.63380
4 4 147 14.7 50 48 49 74 73 72 75 34.01361 32.65306 33.33333 50.34014 49.65986 51.02041 48.97959
5 5 128 12.8 45 43 40 59 69 72 56 35.15625 33.59375 31.25000 46.09375 53.90625 43.75000 56.25000
6 6 152 15.2 37 52 63 74 78 83 69 24.34211 34.21053 41.44737 48.68421 51.31579 45.39474 54.60526
7 7 132 13.2 54 33 45 63 69 70 62 40.90909 25.00000 34.09091 47.72727 52.27273 46.96970 53.03030
I have a list of about a thousand data frames. All data frames have a column Z that consists mostly of NA values, but whenever there is an actual value, it contains either "VALUE1" or "VALUE2". For example:
weight | height | Z
---------------------------
62 100 NA
65 89 NA
59 88 randomnumbersVALUE1randomtext
66 92 NA
64 90 NA
64 87 randomnumbersVALUE2randomtext
57 84 NA
The first actual value of each data frame in the column Z should always contain a value of "VALUE1" in it, so in the example data frame above everything is as it should be. However, if the data frame would look like this:
weight | height | Z
---------------------------
62 100 NA
65 89 NA
59 88 randomnumbersVALUE2randomtext
66 92 NA
64 90 NA
64 87 randomnumbersVALUE1randomtext
57 84 NA
I would need to add a new row into the beginning of the data frame with "VALUE1" in the Z column and value 0 in the height and weight columns. How could I do this for my list of data frames (with the help of functions such as add_row and filter)..?
If dfs is your list of dataframes, you can do this:
# For every data frame in `dfs`: if the first non-NA entry of Z contains
# "VALUE2", prepend a row with weight = 0, height = 0 and Z = "VALUE1";
# otherwise return the data frame unchanged (this includes an all-NA Z,
# because grepl() returns FALSE for NA).
dfs <- lapply(dfs, function(x) {
  # Index the column vector itself so this also works when x is a tibble,
  # where x[rows, "Z"] returns a one-column table instead of a vector.
  first_z <- x$Z[!is.na(x$Z)][1]
  if (grepl("VALUE2", first_z)) {
    rbind(data.frame(weight = 0, height = 0, Z = "VALUE1"), x)
  } else {
    x
  }
})
library(dplyr)
# Take the first row with a non-NA Z; keep it only if it lacks "VALUE1",
# turn it into the placeholder row (weight/height 0, Z = "VALUE1"), and
# put it in front of the original data. If nothing is kept, dat is
# returned with no extra row.
dat %>%
  filter(!is.na(Z)) %>%
  slice(1) %>%
  filter(!grepl("VALUE1", Z)) %>%
  mutate(across(weight:height, ~ 0), Z = "VALUE1") %>%
  bind_rows(dat)
# weight height Z
# 1 0 0 VALUE1
# 2 62 100 <NA>
# 3 65 89 <NA>
# 4 59 88 randomnumbersVALUE2randomtext
# 5 66 92 <NA>
# 6 64 90 <NA>
# 7 64 87 randomnumbersVALUE1randomtext
# 8 57 84 <NA>
Data
# Example data frame whose first non-NA Z contains "VALUE2".
dat <- data.frame(
  weight = c(62L, 65L, 59L, 66L, 64L, 64L, 57L),
  height = c(100L, 89L, 88L, 92L, 90L, 87L, 84L),
  Z = c(
    NA, NA, "randomnumbersVALUE2randomtext",
    NA, NA, "randomnumbersVALUE1randomtext",
    NA
  ),
  stringsAsFactors = FALSE
)
I am working on a large data set with many columns, and I am using data.table to speed up the calculations. However, at certain points I do not know how to express a step in data.table, so I convert my data.table back to a data.frame to do the calculation, which slows the process down. I would appreciate suggestions on how to write the code below in data.table. Below is a snippet of my code on dummy data -
library(data.table)
#### set the seed value so the sampled vectors are reproducible
set.seed(9901)
#### create the sample variables for creating the data:
#### seven vectors of 1000 draws (with replacement) from 1:100.
#### TRUE is spelled out because T is an ordinary, reassignable variable.
p01 <- sample(1:100, 1000, replace = TRUE)
p02 <- sample(1:100, 1000, replace = TRUE)
p03 <- sample(1:100, 1000, replace = TRUE)
p04 <- sample(1:100, 1000, replace = TRUE)
p05 <- sample(1:100, 1000, replace = TRUE)
p06 <- sample(1:100, 1000, replace = TRUE)
p07 <- sample(1:100, 1000, replace = TRUE)
#### create the data.table
data <- data.table(cbind(p01, p02, p03, p04, p05, p06, p07))
### user input for last column
lcol <- 6
### calculate start column as last - 3
scol <- lcol - 3
### calculate average for scol:lcol (row-wise mean over those columns)
data <- data[, avg := apply(.SD, 1, mean, na.rm = TRUE), .SDcols = scol:lcol]
### converting to data.frame since do not know the solution in data.table
data <- as.data.frame(data)
### calculate the trend in percentage: column lcol - k relative to avg, minus 1
data$t01 <- data[, lcol - 00] / data[, "avg"] - 1
data$t02 <- data[, lcol - 01] / data[, "avg"] - 1
data$t03 <- data[, lcol - 02] / data[, "avg"] - 1
data$t04 <- data[, lcol - 03] / data[, "avg"] - 1
data$t05 <- data[, lcol - 04] / data[, "avg"] - 1
### converting back to data.table
data <- as.data.table(data)
### calculate the min and max for the trend
### (row-wise min/max over columns scol:lcol)
data1 <- data[, `:=`(trend_min = apply(.SD, 1, min, na.rm = TRUE),
                     trend_max = apply(.SD, 1, max, na.rm = TRUE)),
              .SDcols = c(scol:lcol)]
### calculate flag if any of t04 OR t05 is an outlier for min and max values. This would be many columns in actual data
data1$flag1 <- ifelse(data1$t04 < data1$trend_min | data1$t04 > data1$trend_max, 1, 0)
data1$flag2 <- ifelse(data1$t05 < data1$trend_min | data1$t05 > data1$trend_max, 1, 0)
data1$flag <- ifelse(data1$flag1 == 1 | data1$flag2 == 1, 1, 0)
So basically, how can I -
calculate the percentages based on user input of column index. Note it is not simple divide but percentage
How can I create the flag variable....I think I need to use any function but not sure how....
Some steps can be made more efficient, i.e. instead of using the apply with MARGIN = 1, the mean, min, max can be replaced with rowMeans, pmin, pmax
library(data.table)
# Row-wise mean over columns scol:lcol, added by reference as 'avg'.
data[, avg := rowMeans(.SD, na.rm = TRUE), .SDcols = scol:lcol]
# Trend columns t01..t05: each selected column relative to avg, minus 1.
# NOTE(review): the question's code derives t01..t05 from columns lcol,
# lcol - 1, ..., lcol - 4 (p06 down to p02), whereas this selects p01..p05
# in that order - confirm which mapping is intended.
# The argument name is spelled out in full (.SDcols) rather than relying on
# partial argument matching.
data[, sprintf('t%02d', 1:5) := lapply(.SD, function(x) x / avg - 1),
     .SDcols = patterns("^p0[1-5]")]
# Row-wise min/max over columns scol:lcol via the vectorised pmin/pmax.
data[, `:=`(trend_min = do.call(pmin, c(.SD, na.rm = TRUE)),
            trend_max = do.call(pmax, c(.SD, na.rm = TRUE))),
     .SDcols = c(scol:lcol)]
data
# p01 p02 p03 p04 p05 p06 p07 avg t01 t02 t03 t04 t05 trend_min trend_max
# 1: 35 53 22 82 100 59 69 65.75 -0.46768061 -0.19391635 -0.6653992 0.24714829 0.5209125 22 100
# 2: 78 75 15 65 70 69 66 54.75 0.42465753 0.36986301 -0.7260274 0.18721461 0.2785388 15 70
# 3: 15 45 27 61 63 75 99 56.50 -0.73451327 -0.20353982 -0.5221239 0.07964602 0.1150442 27 75
# 4: 41 80 13 22 63 84 17 45.50 -0.09890110 0.75824176 -0.7142857 -0.51648352 0.3846154 13 84
# 5: 53 9 75 47 25 75 66 55.50 -0.04504505 -0.83783784 0.3513514 -0.15315315 -0.5495495 25 75
# ---
# 996: 33 75 9 61 74 55 57 49.75 -0.33668342 0.50753769 -0.8190955 0.22613065 0.4874372 9 74
# 997: 24 68 74 11 43 75 37 50.75 -0.52709360 0.33990148 0.4581281 -0.78325123 -0.1527094 11 75
# 998: 62 78 82 97 56 50 74 71.25 -0.12982456 0.09473684 0.1508772 0.36140351 -0.2140351 50 97
# 999: 70 88 93 4 39 75 93 52.75 0.32701422 0.66824645 0.7630332 -0.92417062 -0.2606635 4 93
#1000: 20 50 99 94 62 66 98 80.25 -0.75077882 -0.37694704 0.2336449 0.17133956 -0.2274143 62 99
and then create the 'flag'
# flag is 1 when t04 or t05 lies outside [trend_min, trend_max], else 0
# (stored as an integer).
data[, flag := as.integer(Reduce(`|`, lapply(.SD, function(trend) {
  trend < trend_min | trend > trend_max
}))), .SDcols = t04:t05]
I have something like this,
A B C
100 24
18
16
21
14
I am trying to write a function that calculates C = A-B for the respective row and then adds 20 to C which is A for the next row and repeats the step and it should be like this at the end.
A B C
100 24 76
96 18 78
98 16 82
102 21 81
101 14 87
I am doing it manually atm like
df$C[1] = df$A[1] - df$B[1] and then
df$A[2] = df$C[1]+20 and repeating it.
I would like to create a function instead of doing this way. Any help would be appreciated.
Here is another approach using for loop:
data
# Example data: only B is known up front; A and C are filled in below.
df <- data.frame(A = NA, B = c(24L, 18L, 16L, 21L, 14L), C = NA)

# Fill columns A and C of `df` row by row from column B.
#   df    : data frame with columns A, B and C (A and C are overwritten)
#   start : value of A in the first row
#   step  : amount added to a row's C to obtain the next row's A
# Returns the data frame with C = A - B in every row. A zero-row data frame
# is returned unchanged.
fill_running_balance <- function(df, start = 100, step = 20) {
  n_rows <- nrow(df)
  if (n_rows == 0L) {
    return(df)
  }
  # Initialize first row of df
  df$A[1] <- start
  df$C[1] <- df$A[1] - df$B[1]
  # Populate the remaining rows of df. seq_len() yields no iterations for a
  # one-row data frame, where 1:(n - 1) would iterate over c(1, 0).
  for (i in seq_len(n_rows - 1L)) {
    df$A[i + 1] <- df$C[i] + step
    df$C[i + 1] <- df$A[i + 1] - df$B[i + 1]
  }
  df
}

df <- fill_running_balance(df)
Output
df
A B C
1 100 24 76
2 96 18 78
3 98 16 82
4 102 21 81
5 101 14 87
We can start with only B column and then calculate A and C respectively.
# A starts at start_value; every later A is start_value plus the running sum
# of (20 - B) over the preceding rows. The vector built here is one element
# too long, so only the first nrow(df) entries are kept.
start_value <- 100
df$A <- c(start_value, start_value + cumsum(20 - df$B))[seq_len(nrow(df))]
# C is simply A minus B in the same row.
df$C <- df$A - df$B
df
# B A C
#1 24 100 76
#2 18 96 78
#3 16 98 82
#4 21 102 81
#5 14 101 87
data
# Example input holding only the B column.
df <- data.frame(B = c(24L, 18L, 16L, 21L, 14L))
I have following dataframe in r
count1 count2 count3
0 12 11
12 13 44
22 32 13
I want to calculate distance between count2,count1 and count3 and count2 as follows
sqrt(abs(count2-count1) + abs(count3-count2))
to every row of dataframe. My desired dataframe is as follows
count1 count2 count3 distance
0 12 11 sqrt(abs(12-0)+abs(12-11))
12 13 44 sqrt(abs(13-12)+abs(44-13))
22 32 13 sqrt(abs(32-22)+abs(13-32))
the way I am doing it is with for loop
# Row-by-row distance: sqrt(|count1 - count2| + |count2 - count3|).
# seq_len() makes the loop a no-op for an empty data frame, where 1:nrow(df)
# would iterate over c(1, 0).
for (i in seq_len(nrow(df))) {
  df$distance[i] <- sqrt(abs(df$count1[i] - df$count2[i]) + abs(df$count2[i] - df$count3[i]))
}
Is there any better way of doing above ?
I guess the dplyr package is the way to go for that:
df <- data.frame(count1 = sample(1:100,10),count2 = sample(1:100,10),count3 = sample(1:100,10))
> df %>% mutate(distance=sqrt(abs(count2-count1) + abs(count3-count2)))
count1 count2 count3 distance
1 79 59 54 5.000000
2 70 18 22 7.483315
3 31 13 57 7.874008
4 54 49 53 3.000000
5 94 67 77 6.082763
6 51 74 21 8.717798
7 33 4 24 7.000000
8 90 79 78 3.464102
9 6 64 98 9.591663
10 22 68 28 9.273618
# Row-wise distance via apply(): each row arrives as a vector, and the first
# three positions are taken to be count1, count2 and count3.
df$distance <- apply(df, 1, function(row) {
  first_gap <- abs(row[2] - row[1])
  second_gap <- abs(row[3] - row[2])
  sqrt(first_gap + second_gap)
})
df
We can just use base R
# Vectorised over whole columns: sqrt(|count2 - count1| + |count3 - count2|).
df$distance <- sqrt(abs(df$count2 - df$count1) + abs(df$count3 - df$count2))
Or with rowSums from base R
# Difference between each column and its left neighbour (all columns but the
# first minus all columns but the last), summed in absolute value per row.
df$distance <- sqrt(rowSums(abs(df[-1L] - df[-ncol(df)])))
data
# Example input: three integer count columns, three rows.
df <- data.frame(
  count1 = c(0L, 12L, 22L),
  count2 = c(12L, 13L, 32L),
  count3 = c(11L, 44L, 13L)
)
You can also do with data.table package :
library(data.table)
# Same example as a data.table; 'distance' is added by reference with `:=`.
y <- data.table(
  count1 = c(0, 12, 22),
  count2 = c(12, 13, 32),
  count3 = c(11, 44, 13)
)
y[, `:=`(distance = sqrt(abs(count2 - count1) + abs(count3 - count2)))]
Results :
> y
count1 count2 count3 distance
1: 0 12 11 3.605551
2: 12 13 44 5.656854
3: 22 32 13 5.385165
use dplyr package
pretty much the standard now
Here is a working example using the iris data (use dput(namedataset) to share your data)
library(dplyr)
# First three iris columns plus res = sqrt(|Sepal.Length - Sepal.Width|).
mutate(iris[1:3], res = sqrt(abs(Sepal.Length - Sepal.Width)))