Make a new column for every variable and tally [duplicate] - r

This question already has answers here:
R Split delimited strings in a column and insert as new column (in binary) [duplicate]
(3 answers)
Closed 4 months ago.
I have the following dataframe:
sample name
1 a cobra, tiger, reptile
2 b tiger, spynx
3 c reptile, cobra
4 d sphynx, tiger
5 e cat, dog, tiger
6 f dog, spynx
and what I want to make from that is.
sample cobra tiger spynx reptile cat dog
1 a 1 1 0 1 0 0
2 b 0 1 1 0 0 0
3 c 1 0 0 1 0 0
4 d 0 1 1 0 0 0
5 e 0 1 0 0 1 1
6 f 0 0 1 0 0 1
So basically, I want to make a new column for every distinct value that appears in the column name, and put a 1 in it if that value is present in df$name for that row and a 0 if it is not.
all <- unique(unlist(strsplit(as.character(df$name), ", ")))
all <- all[!is.na(all)]
for(i in df){
df[i]<- 0 }
this gives me all the variables as 0's, and now I want to match it to the name column, and if it is present make a 1 out of the 0
How would you approach this?

With tidyr and dplyr...
library(tidyr)
library(dplyr, warn = FALSE)
df1 |>
separate_rows(name) |>
group_by(sample, name) |>
summarise(count = n(), .groups = "drop") |>
pivot_wider(names_from = "name", values_from = "count", values_fill = 0)
#> # A tibble: 6 × 8
#> sample cobra reptile tiger spynx sphynx cat dog
#> <chr> <int> <int> <int> <int> <int> <int> <int>
#> 1 a 1 1 1 0 0 0 0
#> 2 b 0 0 1 1 0 0 0
#> 3 c 1 1 0 0 0 0 0
#> 4 d 0 0 1 0 1 0 0
#> 5 e 0 0 1 0 0 1 1
#> 6 f 0 0 0 1 0 0 1
Created on 2022-10-19 with reprex v2.0.2
data
# Example input: one row per sample; `name` holds a comma-separated list of
# labels (the "spynx"/"sphynx" spellings are reproduced as given in the question).
df1 <- data.frame(
  sample = c("a", "b", "c", "d", "e", "f"),
  name = c(
    "cobra, tiger, reptile",
    "tiger, spynx",
    "reptile, cobra",
    "sphynx, tiger",
    "cat, dog, tiger",
    "dog, spynx"
  )
)

Related

Create presence/absence variables from character string for long data

Let's say I have a data frame like this:
# Example data: one row per ID x test x item. `answer` holds the selected
# options of a multiple-choice question, joined by underscores.
dat <- data.frame(
  ID = rep(c("A", "B", "C", "D"), 4),
  test = rep(c("pre", "post"), 8),
  # The comma after this argument was missing, which made the call unparseable.
  item = c(rep("item1", 8), rep("item2", 8)),
  answer = c(
    "undergraduateeducation_graduateorprofessionalschool_employment",
    "graduateorprofessionalschool",
    "undergraduateeducation_graduateorprofessionalschool_employment_volunteeractivityoroutreach",
    "volunteeractivityoroutreach",
    "undergraduateeducation_employment_volunteeractivityoroutreach",
    "employment",
    "volunteeractivityoroutreach",
    "undergraduateeducation_graduateorprofessionalschool_employment_volunteeractivityoroutreach",
    "undergraduateeducation_graduateorprofessionalschool_employment",
    "graduateorprofessionalschool",
    "undergraduateeducation_graduateorprofessionalschool_employment_volunteeractivityoroutreach",
    "volunteeractivityoroutreach",
    "undergraduateeducation_employment_volunteeractivityoroutreach",
    "employment",
    "volunteeractivityoroutreach",
    "undergraduateeducation_graduateorprofessionalschool_employment_volunteeractivityoroutreach"
  )
)
The answer column represents a "select all that apply" answer type, where the underscore separates the selected answer options. For each ID, test and item, I would like to change this single variable into multiple presence/absence variables indicating the presence or absence of that answer component in the string. 1 indicates that the answer option was present in the respondent's answer and 0 indicates that the component was absent. The variables undergraduate, graduate, employment and volunteer in res correspond to the following strings in answer, respectively: undergraduateeducation, graduateorprofessionalschool, employment, volunteeractivityoroutreach. White spaces were removed.
The result data frame would look as follows:
res<- data.frame(ID= rep(c("A","B","C","D"),4),
test= rep(c("pre","post"),8),
item= c(rep("item1",8),rep("item2",8)),
undergraduate= c(1,0,1,0,1,0,0,1,1,0,1,0,1,0,0,1),
graduate= c(1,1,1,0,0,0,0,1,1,1,1,0,0,0,0,1),
employment=c(1,0,1,0,1,1,0,1,1,0,1,0,1,1,0,1),
volunteer=c(0,0,1,1,1,0,1,1,0,0,1,1,1,0,1,1))
In base R you could do:
new_cols <- c('undergraduate', 'graduate', 'employment', 'volunteer')
# Split each answer on "_" and, per respondent, count the components that
# start with each keyword. "\\b" anchors the keyword at the start of the
# component, so "graduate" does not match inside "undergraduateeducation".
# vapply() always returns one named numeric value per keyword, whether the
# answer has zero, one or several components, so no is.vector()/colSums()
# special-casing is needed before rbind().
cbind(dat[1:3],
      as.data.frame(do.call(rbind, lapply(strsplit(dat$answer, "_"),
        function(x) {
          vapply(new_cols,
                 function(y) sum(as.numeric(grepl(paste0("\\b", y), x))),
                 numeric(1))
        }))))
#> ID test item undergraduate graduate employment volunteer
#> 1 A pre item1 1 1 1 0
#> 2 B post item1 0 1 0 0
#> 3 C pre item1 1 1 1 1
#> 4 D post item1 0 0 0 1
#> 5 A pre item1 1 0 1 1
#> 6 B post item1 0 0 1 0
#> 7 C pre item1 0 0 0 1
#> 8 D post item1 1 1 1 1
#> 9 A pre item2 1 1 1 0
#> 10 B post item2 0 1 0 0
#> 11 C pre item2 1 1 1 1
#> 12 D post item2 0 0 0 1
#> 13 A pre item2 1 0 1 1
#> 14 B post item2 0 0 1 0
#> 15 C pre item2 0 0 0 1
#> 16 D post item2 1 1 1 1
Created on 2022-05-05 by the reprex package (v2.0.1)
One option is to use tidyverse to separate the data into rows on the _, then keep only the keywords (which will be used for column names). Then, we create a value column to note presence, then we can pivot to wide format, and fill the other values with 0.
library(tidyverse)
result <- dat %>%
mutate(rn = row_number()) %>%
separate_rows(answer, sep = "_") %>%
mutate(answer = str_extract(answer, "undergraduate|graduate|employment|volunteer"),
value = 1) %>%
pivot_wider(names_from = "answer", values_from = "value", values_fill = 0) %>%
select(-rn)
Output
ID test item undergraduate graduate employment volunteer
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 A pre item1 1 1 1 0
2 B post item1 0 1 0 0
3 C pre item1 1 1 1 1
4 D post item1 0 0 0 1
5 A pre item1 1 0 1 1
6 B post item1 0 0 1 0
7 C pre item1 0 0 0 1
8 D post item1 1 1 1 1
9 A pre item2 1 1 1 0
10 B post item2 0 1 0 0
11 C pre item2 1 1 1 1
12 D post item2 0 0 0 1
13 A pre item2 1 0 1 1
14 B post item2 0 0 1 0
15 C pre item2 0 0 0 1
16 D post item2 1 1 1 1
Test
identical(result, as_tibble(res))
#[1] TRUE

How can I create columns from the values in another column (R)?

I would like to create a table that assigns a column to each value of one column.
The data looks like this:
Person Task
John 4
Michael 1
Florence 3
Expected result:
Person Task 1 2 3 4 5 6 7 8
John 4 1 1 1 1 0 0 0 0
Michael 1 0 0 0 0 1 0 0 0
Florence 3 0 0 0 0 0 1 1 1
It's important that the column values are filled in order: the first row first, then the second, and so on.
Thanks!
in base R:
cbind(df, t(unname(model.matrix(~with(df, factor(rep(Person, Task), Person))-1))))
Person Task 1 2 3 4 5 6 7 8
1 John 4 1 1 1 1 0 0 0 0
2 Michael 1 0 0 0 0 1 0 0 0
3 Florence 3 0 0 0 0 0 1 1 1
For easier code:
Create a dataframe that looks like the one below:
df1 <- with(df, data.frame(lengths = Task, values = factor(Person, Person)))
df1
lengths values
1 4 John
2 1 Michael
3 3 Florence
Note that values is now a factor column with levels same as the values.
Then you could simply do:
cbind(df, t(unname(model.matrix(~inverse.rle(df1)-1))))
Person Task 1 2 3 4 5 6 7 8
1 John 4 1 1 1 1 0 0 0 0
2 Michael 1 0 0 0 0 1 0 0 0
3 Florence 3 0 0 0 0 0 1 1 1
You could use
library(dplyr)
library(tidyr)
df %>%
  # One row per unit of Task; .remove = FALSE keeps Task for the final table.
  uncount(Task, .remove = FALSE) %>%
  # rn is the running row number and becomes the target column name;
  # value = 1 marks that this column belongs to the person.
  mutate(rn = row_number(),
         value = 1) %>%
  # id_cols is named explicitly: recent tidyr versions deprecate passing it
  # by position, because `...` now follows `data` in pivot_wider().
  pivot_wider(id_cols = c(Person, Task),
              names_from = rn,
              values_from = value,
              values_fill = 0)
This returns
# A tibble: 3 x 10
Person Task `1` `2` `3` `4` `5` `6` `7` `8`
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 John 4 1 1 1 1 0 0 0 0
2 Michael 1 0 0 0 0 1 0 0 0
3 Florence 3 0 0 0 0 0 1 1 1
Setting up your dataframe:
> df <- data.frame(Name=factor(c("John", "Michael", "Florence"), levels=c("John", "Michael", "Florence")), Task=c(4,1,3))
> df
Name Task
1 John 4
2 Michael 1
3 Florence 3
First I will make a 'long' dataframe, expanding each name and task by the number of entries needed. The id will make sure that when I reshape the dataframe wide the columns have the correct names:
# Long format: repeat each person's Name and Task `Task` times. `id` numbers
# the expanded rows and supplies the column names after reshaping to wide.
# seq_len() is used instead of 1:sum(...) so that a total of 0 yields an
# empty id column rather than c(1, 0).
df2 <- data.frame(Name = rep(df$Name, df$Task),
                  Task = rep(df$Task, df$Task),
                  id = seq_len(sum(df$Task)))
> df2
Name Task id
1 John 4 1
2 John 4 2
3 John 4 3
4 John 4 4
5 Michael 1 5
6 Florence 3 6
7 Florence 3 7
8 Florence 3 8
Now I can reshape wide using the dcast function from reshape2
reshape2::dcast(df2, Name+Task ~ id, fun.aggregate = length, value.var="id")
Name Task 1 2 3 4 5 6 7 8
1 John 4 1 1 1 1 0 0 0 0
2 Michael 1 0 0 0 0 1 0 0 0
3 Florence 3 0 0 0 0 0 1 1 1
A solution with diag(), since the added values have the shape of a diagonal matrix, just with repeated columns:
# Column i of the n x n identity matrix is the indicator for row i. Selecting
# that column Task[i] times, for every i, yields the block of repeated
# indicator columns directly, with no apply()/unlist()/matrix() round trip.
n <- length(dat$Task)
cbind(dat, diag(n)[, rep(seq_len(n), dat$Task), drop = FALSE])
Person Task 1 2 3 4 5 6 7 8
1 John 4 1 1 1 1 0 0 0 0
2 Michael 1 0 0 0 0 1 0 0 0
3 Florence 3 0 0 0 0 0 1 1 1
Data:
dat <- structure(list(Person = c("John", "Michael", "Florence"), Task = c(4L,
1L, 3L)), class = "data.frame", row.names = c(NA, -3L))

R Long to wide with count and sum [duplicate]

This question already has an answer here:
R: Reshaping Multiple Columns from Long to Wide
(1 answer)
Closed 2 years ago.
I have a data as below:
#dt
Method ID Source Amt
A 1 X 10
A 1 Y 20
C 1 Z 30
B 2 Y 15
D 2 Z 10
C 3 X 20
D 3 X 20
E 4 Z 10
E 4 Z 10
What I want is:
ID Total_Amt Method_A Method_B Method_C Method_D Method_E Source_X Source_Y Source_Z
1 60 2 0 1 0 0 1 1 1
2 25 0 1 0 1 0 0 1 1
3 40 0 0 1 1 0 2 0 0
4 20 0 0 0 0 2 0 0 2
For the Method and Source columns, I want to calculate the count by their ID and use dcast to transform to wide format and also add up Amt column by ID.
Any Help?
Here's one way using dplyr and tidyr libraries. We first calculate sum of Amt values for each ID, get the data in long format, count number of rows and get it back in wide format.
library(dplyr)
library(tidyr)
df %>%
group_by(ID) %>%
mutate(Amt = sum(Amt)) %>%
pivot_longer(cols = c(Method, Source)) %>%
count(ID, value, Amt, name) %>%
pivot_wider(names_from = c(name, value), values_from = n, values_fill = 0)
# ID Amt Method_A Method_C Source_X Source_Y Source_Z Method_B Method_D Method_E
# <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
#1 1 60 2 1 1 1 1 0 0 0
#2 2 25 0 0 0 1 1 1 1 0
#3 3 40 0 1 2 0 0 0 1 0
#4 4 20 0 0 0 0 2 0 0 2

Only Use The First Match For Every N Rows

I have a data.frame that looks like this.
Date Number
1 1
2 0
3 1
4 0
5 0
6 1
7 0
8 0
9 1
I would like to create a new column that puts a 1 in the column if it is the first 1 of every 3 rows. Otherwise put a 0. For example, this is how I would like the new data.frame to look
Date Number New
1 1 1
2 0 0
3 1 0
4 0 0
5 0 0
6 1 1
7 0 0
8 0 0
9 1 1
Every three rows we find the first 1 and populate the column otherwise we place a 0. Thank you.
Hmm, at first glance I thought @akrun's answer provided the solution. However, it is not exactly what I am looking for. Here is what @akrun's solution gives.
df1 = data.frame(Number = c(1,0,1,0,1,1,1,0,1,0,0,0))
head(df1,9)
Number
1 1
2 0
3 1
4 0
5 1
6 1
7 1
8 0
9 1
Attempt at solution:
df1 %>%
group_by(grp = as.integer(gl(n(), 3, n()))) %>%
mutate(New = +(Number == row_number()))
Number grp New
<dbl> <int> <int>
1 1 1 1
2 0 1 0
3 1 1 0
4 0 2 0
5 1 2 0 #should be a 1
6 1 2 0
7 1 3 1
8 0 3 0
9 1 3 0
As you can see the code misses the one on row 5. I am looking for the first 1 in every chunk. Then everything else should be 0.
Sorry if I was unclear, akrun.
Edit** Akrun new answer is exactly what I am looking for. Thank you very much
Here is an option to create a grouping column with gl and then do a == with the row_number on the index of matched 1. Here, match will return only the index of the first match.
library(dplyr)
df1 %>%
group_by(grp = as.integer(gl(n(), 3, n()))) %>%
mutate(New = +(row_number() == match(1, Number, nomatch = 0)))
# A tibble: 12 x 3
# Groups: grp [4]
# Number grp New
# <dbl> <int> <int>
# 1 1 1 1
# 2 0 1 0
# 3 1 1 0
# 4 0 2 0
# 5 1 2 1
# 6 1 2 0
# 7 1 3 1
# 8 0 3 0
# 9 1 3 0
#10 0 4 0
#11 0 4 0
#12 0 4 0
Looking at the logic, perhaps you want to check if Number == 1 and that the prior 2 values were both 0. If that is not correct please let me know.
library(dplyr)
df %>%
mutate(New = ifelse(Number == 1 & lag(Number, n = 1L, default = 0) == 0 & lag(Number, n = 2L, default = 0) == 0, 1, 0))
Output
Date Number New
1 1 1 1
2 2 0 0
3 3 1 0
4 4 0 0
5 5 0 0
6 6 1 1
7 7 0 0
8 8 0 0
9 9 1 1
You can replace Number value to 0 except for the 1st occurrence of 1 in each 3 rows.
library(dplyr)
df %>%
group_by(gr = ceiling(row_number()/3)) %>%
mutate(New = replace(Number, -which.max(Number), 0)) %>%
#Or to be safe and specific use
#mutate(New = replace(Number, -which(Number == 1)[1], 0)) %>%
ungroup() %>% select(-gr)
# A tibble: 9 x 3
# Date Number New
# <int> <int> <int>
#1 1 1 1
#2 2 0 0
#3 3 1 0
#4 4 0 0
#5 5 0 0
#6 6 1 1
#7 7 0 0
#8 8 0 0
#9 9 1 1

How to separate with unequal column (reverse toString) in dplyr

I'm working with survey data and trying to split multiple responses stored in a single column. The problem is that there may be 1-5 answers, separated by commas.
How do I turn this:
df <- data.frame(
splitThis = c("A,B,C","B,C","A,C","A","B","C")
)
> df
splitThis
1 A,B,C
2 B,C
3 A,C
4 A
5 B
6 C
Into this:
intoThis <- data.frame(
A = c(1,0,1,1,0,0),
B = c(1,1,0,0,1,0),
c = c(1,1,1,0,0,1)
)
> intoThis
A B c
1 1 1 1
2 0 1 1
3 1 0 1
4 1 0 0
5 0 1 0
6 0 0 1
Any wrangling help appreciated!
We can use mtabulate from qdapTools after splitting by ,
library(qdapTools)
mtabulate(strsplit(as.character(df$splitThis), ","))
# A B C
#1 1 1 1
#2 0 1 1
#3 1 0 1
#4 1 0 0
#5 0 1 0
#6 0 0 1
As the OP also mentioned dplyr/tidyr
library(dplyr)
library(tidyr)
library(tibble)
rownames_to_column(df, "rn") %>%
separate_rows(splitThis) %>%
table()
Or using tidyverse packages
df %>%
  # Integer row id: character row names would sort as "1", "10", "2", ...
  # and scramble the row order once there are 10 or more rows.
  mutate(rn = row_number()) %>%
  separate_rows(splitThis) %>%
  # Number of times each value occurs in each original row.
  count(rn, splitThis) %>%
  # pivot_wider() replaces the superseded spread(); names_sort = TRUE keeps
  # the alphabetical column order that spread() produced.
  pivot_wider(names_from = splitThis, values_from = n,
              values_fill = 0, names_sort = TRUE) %>%
  select(-rn)
# A tibble: 6 × 3
#      A     B     C
#  <int> <int> <int>
#1     1     1     1
#2     0     1     1
#3     1     0     1
#4     1     0     0
#5     0     1     0
#6     0     0     1

Resources