I want to summarize data, create dynamically named columns, and store the results in different data frames:
data is something like:
col1 col2 col3
A 1 200
B 1 300
A 2 400
k=c("A","B","C")
for(i in k)
{
group_data <- group_by(data[data$col1==i,], col2)
summary_i<- summarize(group_data ,paste("var",k[i],sep="_") = n())
}
Expected output:
Three data frames named summary_A, summary_B and summary_C, containing the variables var_A, var_B and var_C respectively.
As correctly pointed out by @MrFlick, there are better ways to manage your problem.
Anyway, here is a working version of your code:
data <- structure(list(col1 = structure(c(1L, 2L, 1L), .Label = c("A",
"B"), class = "factor"), col2 = c(1L, 1L, 2L), col3 = c(200L,
300L, 400L)), .Names = c("col1", "col2", "col3"), class = "data.frame", row.names = c(NA,
-3L))
k <- c("A", "B", "C")
# Build one summary per entry of k and store it under a dynamically
# generated name (summary_1, summary_2, ...) whose count column is named
# to match (var_1, var_2, ...).
for (i in seq_along(k)) {
  group_data <- group_by(data[data$col1 == k[i], ], col2)
  vark <- paste("var", i, sep = "_")
  # `!!vark :=` names the output column from a string and assign() creates
  # the dynamically named object, so no code needs to be pasted together
  # and run through eval(parse()).
  assign(paste0("summary_", i), summarize(group_data, !!vark := n()))
}
print(summary_1)
# A tibble: 2 x 2
# col2 var_1
# <int> <int>
# 1 1 1
# 2 2 1
print(summary_2)
# A tibble: 1 x 2
# col2 var_2
# <int> <int>
# 1 1 1
print(summary_3)
# A tibble: 0 x 2
# ... with 2 variables: col2 <int>, var_3 <int>
Related
I would need help in order to add count column into a table called tab1 according to another tab2.
Here is the first tab :
tab1
Event_Groups Other_column
1 1_G1,2_G2 A
2 2_G1 B
3 4_G4 C
4 7_G5,8_G5,9_G5 D
As you can see, the Event_Groups column holds two pieces of information (the Event and Group numbers, separated by a "_"). The same information is found in tab2$Group and tab2$Event. The idea is, for each element within a row of tab1 (elements are separated by commas), to count the number of rows in tab2 where VALUE1 < 10 AND VALUE2 > 30, and then add this count to tab1 in a new column called Sum_count.
Here is the second tab:
tab2
Group Event VALUE1 VALUE2
1 G1 1 5 50 <- VALUE1 < 10 & VALUE2 > 30 : count 1
2 G1 2 6 20 <- VALUE2 < 30 : count 0
3 G2 2 50 50 <- VALUE1 > 10 : count 0
4 G3 3 0 0
5 G4 1 0 0
6 G4 4 2 40 <- VALUE1 < 10 & VALUE2 > 30 : count 1
7 G5 7 1 70 <- VALUE1 < 10 & VALUE2 > 30 : count 1
8 G5 8 4 67 <- VALUE1 < 10 & VALUE2 > 30 : count 1
9 G5 9 3 60 <- VALUE1 < 10 & VALUE2 > 30 : count 1
Example :
For instance for the first element of row1 in tab1: 1_G1
we see in tab2 (row1) that VALUE1 < 10 & VALUE2 > 30, so I count 1.
For the second element of row 1 (2_G2), we see in tab2 (row 3) that VALUE1 > 10, so I count 0.
And here is the expected result tab1 dataframe;
Event_Groups Other_column Sum_count
1_G1,2_G2 A 1
2_G1 B 0
4_G4 C 1
7_G5,8_G5,9_G5 D 3
I do not know if I am clear enough; do not hesitate to ask questions.
Here are the two tables in dput format, in case it helps:
tab1
structure(list(Event_Groups = structure(1:4, .Label = c("1_G1,2_G2",
"2_G1", "4_G4", "7_G5,8_G5,9_G5"), class = "factor"), Other_column =
structure(1:4, .Label = c("A", "B", "C", "D"), class = "factor")),
class = "data.frame", row.names = c(NA,
-4L))
tab2
structure(list(Group = structure(c(1L, 1L, 2L, 3L, 4L, 4L, 5L,
5L, 5L), .Label = c("G1", "G2", "G3", "G4", "G5"), class = "factor"),
Event = c(1L, 2L, 2L, 3L, 1L, 4L, 7L, 8L, 9L), VALUE1 = c(5L,
6L, 50L, 0L, 0L, 2L, 1L, 4L, 3L), VALUE2 = c(50, 20, 50,
0, 0, 40, 70, 67, 60)), class = "data.frame", row.names = c(NA,
-9L))
Here is one way to do it:
library(dplyr)
library(tidyr)
# Split each Event_Groups entry into one row per element, join the flag
# computed from tab2, then collapse back to one row per Other_column value.
tab1 %>%
# Event_Groups is a factor in tab1; convert it to character before splitting
mutate(Event_Groups = as.character(Event_Groups)) %>%
separate_rows(Event_Groups, sep = ",") %>%
left_join(.,
# build the same "Event_Group" key in tab2 and flag rows meeting the condition
tab2 %>%
unite(col = "Event_Groups", Event, Group) %>%
mutate(count = if_else(VALUE1 < 10 & VALUE2 > 30,1L, 0L))) %>%
group_by(Other_column) %>%
# re-assemble the comma-separated key and add up the flags per group
summarise(Event_Groups = paste(unique(Event_Groups), collapse = ","),
Sum_count = sum(count)) %>%
# put Event_Groups back as the first column
select(Event_Groups, everything())
#> Joining, by = "Event_Groups"
#> `summarise()` ungrouping output (override with `.groups` argument)
#> # A tibble: 4 x 3
#> Event_Groups Other_column Sum_count
#> <chr> <fct> <int>
#> 1 1_G1,2_G2 A 1
#> 2 2_G1 B 0
#> 3 4_G4 C 1
#> 4 7_G5,8_G5,9_G5 D 3
Created on 2021-07-29 by the reprex package (v0.3.0)
You can try a tidyverse approach:
library(tidyverse)
# Per original tab1 row, sum a 0/1 flag marking the tab2 rows where
# VALUE1 < 10 & VALUE2 > 30.
tab1 %>%
  # keep the original row id so the split rows can be regrouped later
  rownames_to_column() %>%
  separate_rows(Event_Groups, sep = ",") %>%
  # `TRUE` rather than `T`: `T` is an ordinary variable that can be reassigned
  separate(Event_Groups, into = c("Event", "Group"), sep = "_",
           convert = TRUE) %>%
  left_join(tab2 %>%
              mutate(count = as.numeric(VALUE1 < 10 & VALUE2 > 30)),
            by = c("Event", "Group")) %>%
  # rebuild the "Event_Group" key after the join
  unite(Event_Groups, Event, Group) %>%
  group_by(rowname) %>%
  summarise(Event_Groups = toString(Event_Groups),
            Other_column = unique(Other_column),
            count = sum(count))
# A tibble: 4 x 4
rowname Event_Groups Other_column count
<chr> <chr> <chr> <dbl>
1 1 1_G1, 2_G2 A 1
2 2 2_G1 B 0
3 3 4_G4 C 1
4 4 7_G5, 8_G5, 9_G5 D 3
I have a dataframe with multiple columns
col1|col2|col3|colA|colB|colC|Percent
1 1 1 2 2 2 50
Earlier I subset the columns and created a vector
ColAlphabet<-c("ColA","ColB","ColC")
What I want to do is take the columns named in ColAlphabet and multiply them by Percent (as a percentage, i.e. Percent / 100), so that in the end I have
col1|col2|col3|colA|colB|colC|Percent
1 1 1 1 1 1 50
We can use mutate with across. Specify the columns of interest wrapped with all_of and multiply the columns with 'Percent'
library(dplyr)
df2 <- df1 %>%
mutate(across(all_of(ColAlphabet), ~ .* Percent/100))
-output
df2
# col1 col2 col3 colA colB colC Percent
#1 1 1 1 1 1 1 50
data
df1 <- structure(list(col1 = 1L, col2 = 1L, col3 = 1L, colA = 2L, colB = 2L,
colC = 2L, Percent = 50L), class = "data.frame", row.names = c(NA,
-1L))
You can subset the column, multiply with Percent and save it in ColAlphabet again.
ColAlphabet<-c("colA","colB","colC")
df[ColAlphabet] <- df[ColAlphabet] * df$Percent/100
df
# col1 col2 col3 colA colB colC Percent
#1 1 1 1 1 1 1 50
We can also use apply():
# Names of the columns to rescale
ColAlphabet <- c("colA", "colB", "colC")
# Walk over the selected columns (MARGIN = 2) and scale each by Percent / 100
df[, ColAlphabet] <- apply(
  X = df[, ColAlphabet],
  MARGIN = 2,
  FUN = function(column) column * df$Percent / 100
)
Output:
df
col1 col2 col3 colA colB colC Percent
1 1 1 1 1 1 1 50
Some data used:
#Data
df <- structure(list(col1 = 1L, col2 = 1L, col3 = 1L, colA = 2L, colB = 2L,
colC = 2L, Percent = 50L), class = "data.frame", row.names = c(NA,
-1L))
In case if you want to multiply directly:
> df <- data.frame(col1 = 1, col2 = 1, col3 = 1, colA = 2, colB = 2, colC = 2, Percent = 50)
> df
col1 col2 col3 colA colB colC Percent
1 1 1 1 2 2 2 50
> df[grep('^c.*[A-Z]$', names(df))] <- df[grep('^c.*[A-Z]$', names(df))] * df$Percent/100
> df
col1 col2 col3 colA colB colC Percent
1 1 1 1 1 1 1 50
>
This question already has an answer here:
dplyr::first() to choose first non NA value
(1 answer)
Closed 2 years ago.
I understand we can use the dplyr function coalesce() to unite different columns, but is there such function to unite rows?
I am struggling with a confusing incomplete/doubled dataframe with duplicate rows for the same id, but with different columns filled. E.g.
id sex age source
12 M NA 1
12 NA 3 1
13 NA 2 2
13 NA NA NA
13 F 2 NA
and I am trying to achieve:
id sex age source
12 M 3 1
13 F 2 2
You can try:
library(dplyr)
#Data
df <- structure(list(id = c(12L, 12L, 13L, 13L, 13L), sex = structure(c(2L,
NA, NA, NA, 1L), .Label = c("F", "M"), class = "factor"), age = c(NA,
3L, 2L, NA, 2L), source = c(1L, 1L, 2L, NA, NA)), class = "data.frame", row.names = c(NA,
-5L))
# Within each id, propagate the non-missing values to every row and keep
# a single row per id.
df %>%
  group_by(id) %>%
  # fill() comes from tidyr, which is not attached above, so it is
  # namespace-qualified; "downup" fills downwards first, then upwards.
  tidyr::fill(everything(), .direction = "downup") %>%
  slice(1)
# A tibble: 2 x 4
# Groups: id [2]
id sex age source
<int> <fct> <int> <int>
1 12 M 3 1
2 13 F 2 2
As mentioned by @A5C1D2H2I1M1N2O1R2T1, you can select the first non-NA value in each group. This can be done using dplyr:
library(dplyr)
# For every id, keep the first non-NA value of each remaining column.
# The columns are passed to across() explicitly: calling it without
# `.cols` is deprecated as of dplyr 1.1.0.
df %>%
  group_by(id) %>%
  summarise(across(everything(), ~ na.omit(.x)[1]))
# A tibble: 2 x 4
# id sex age source
# <int> <fct> <int> <int>
#1 12 M 3 1
#2 13 F 2 2
Base R :
aggregate(.~id, df, function(x) na.omit(x)[1], na.action = 'na.pass')
Or data.table :
library(data.table)
setDT(df)[, lapply(.SD, function(x) na.omit(x)[1]), id]
I am trying to prep my data and I am stuck on one issue. Let's say I have the following data frame:
df1
Name C1 Val1
A a x1
A a x2
A b x3
A c x4
B d x5
B d x6
...
and I want to narrow down the df to
df2
Name C1 Val
A a,b,c x1+x2+x3+x4
B d x5+x6
...
while a is a character value and x is numeric value
I have been trying using sapply, rowsum and
df2<- aggregate(df1, list(df1[,1]), FUN= summary)
but it just can't put the character values in a list for each Name.
Can someone help me how to receive df2?
# Collapse one group's values: sum them when they can be read as numbers,
# otherwise return the distinct values as a comma-separated string.
# `as.is = TRUE` is passed explicitly because type.convert() warns when the
# caller leaves it unspecified (R >= 4.0.0); it also keeps non-numeric
# input as character instead of turning it into a factor.
m <- function(x) {
  x <- type.convert(x, as.is = TRUE)
  if (is.numeric(x)) sum(x) else toString(unique(x))
}
aggregate(.~Name,df1,m)
Name C1 Val1
1 A a, b, c 10
2 B d 11
where
df1
Name C1 Val1
1 A a 1
2 A a 2
3 A b 3
4 A c 4
5 B d 5
6 B d 6
This is your df, I give it numbers 1 to 6 in Val1
df <-
structure(list(Name = structure(c(1L, 1L, 1L, 1L, 2L, 2L), .Label = c("A",
"B"), class = "factor"), C1 = structure(c(1L, 1L, 2L, 3L, 4L,
4L), .Label = c("a", "b", "c", "d"), class = "factor"), Val1 = 1:6), row.names = c(NA,
-6L), class = "data.frame")
We just use summarise:
df %>%
group_by(Name) %>%
summarise(C1=paste(unique(C1),collapse=","),Val1=sum(Val1))
# A tibble: 2 x 3
Name C1 Val1
<fct> <chr> <int>
1 A a,b,c 10
2 B d 11
Quick and easy dplyr solution:
library(dplyr)
library(stringr)
df1 %>%
mutate(Val1_num = as.numeric(str_extract(Val1, "\\d+"))) %>%
group_by(Name) %>%
summarise(C1 = paste(unique(C1), collapse = ","),
Val1 = paste(unique(Val1), collapse = "+"),
Val1_num = sum(Val1_num))
#> # A tibble: 2 x 4
#> Name C1 Val1 Val1_num
#> <chr> <chr> <chr> <dbl>
#> 1 A a,b,c x1+x2+x3+x4 10
#> 2 B d x5+x6 11
Or in base:
# Aggregate every column of df1, grouped by the values of its first column.
df2 <- aggregate(df1, list(df1[,1]), FUN = function(x) {
# if every value in the group contains a digit: strip the non-digits and sum
if (all(grepl("\\d", x))) {
sum(as.numeric(gsub("[^[:digit:]]", "", x)))
} else {
# otherwise join the distinct values with commas
paste(unique(x), collapse = ",")
}
})
df2
#> Group.1 Name C1 Val1
#> 1 A A a,b,c 10
#> 2 B B d 11
data
df1 <- read.csv(text = "
Name,C1,Val1
A,a,x1
A,a,x2
A,b,x3
A,c,x4
B,d,x5
B,d,x6", stringsAsFactors = FALSE)
Hi all, I have a data frame. I need to create additional columns giving the position at which each category appears in ColA. For example, please refer to the expected output.
df
ColB ColA
X A>B>C
U B>C>A
Z C>A>B
Expected output
df1
ColB ColA A B C
X A>B>C 1 2 3
U B>C>A 3 1 2
Z C>A>B 2 3 1
We can first bring ColA into separate rows, group_by ColB and give an unique row number for each entry and then convert the data into wide format using pivot_wider.
library(dplyr)
library(tidyr)
df %>%
mutate(ColC = ColA) %>%
separate_rows(ColC, sep = ">") %>%
group_by(ColB) %>%
mutate(row = row_number()) %>%
pivot_wider(names_from = ColC, values_from = row)
# ColB ColA A B C
# <fct> <fct> <int> <int> <int>
#1 X A>B>C 1 2 3
#2 U B>C>A 3 1 2
#3 Z C>A>B 2 3 1
data
df <- structure(list(ColB = structure(c(2L, 1L, 3L), .Label = c("U",
"X", "Z"), class = "factor"), ColA = structure(1:3, .Label = c("A>B>C",
"B>C>A", "C>A>B"), class = "factor")), class = "data.frame", row.names = c(NA, -3L))
We can do this in base R
# For each row, find the position of "A", "B" and "C" within ColA.
# regmatches()/gregexpr() pull out the capital letters in order and
# match() looks up where each target letter sits among them.
# vapply() is used instead of sapply() so every element must yield exactly
# three integers and the result is always a 3-row integer matrix.
df[LETTERS[1:3]] <- t(vapply(
  regmatches(df$ColA, gregexpr("[A-Z]", df$ColA)),
  match,
  FUN.VALUE = integer(3),
  x = LETTERS[1:3]
))
df
# ColB ColA A B C
#1 X A>B>C 1 2 3
#2 U B>C>A 3 1 2
#3 Z C>A>B 2 3 1
data
df <- structure(list(ColB = structure(c(2L, 1L, 3L), .Label = c("U",
"X", "Z"), class = "factor"), ColA = structure(1:3, .Label = c("A>B>C",
"B>C>A", "C>A>B"), class = "factor")), class = "data.frame",
row.names = c(NA,
-3L))