one hot encoding dirty column in R dplyr - r

I have a column like so. The column begins and ends with a ',' and each value is separated by ',,'.
col1
,101,,9,,201,,200,
,201,,101,,102,
,9,,101,,102,,200,,201,
,101,,200,,9,,102,,102,
How can I transform this column into the following:
col1_9 col1_101 col1_102 col1_200 col1_201
1 1 0 1 1
0 1 1 0 1
1 1 1 1 1
1 1 2 1 0

# Tag each record with a row id, explode the comma-separated values into one
# row per value, drop the empty pieces left by the surrounding commas, then
# spread the values into columns holding per-row occurrence counts.
df %>%
  mutate(rowid = row_number(), value = 1) %>%
  separate_rows(col1) %>%
  filter(nzchar(col1)) %>%
  pivot_wider(
    # id_cols is named explicitly: passing it by position is deprecated in
    # current tidyr releases.
    id_cols = rowid, names_from = col1, values_from = value,
    values_fn = sum, names_prefix = "col1_", values_fill = 0
  )
# A tibble: 4 x 6
rowid col1_101 col1_9 col1_201 col1_200 col1_102
<int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 1 1 1 0
2 2 1 0 1 0 1
3 3 1 1 1 1 1
4 4 1 1 0 1 2
in Base R:
# Strip the leading/trailing commas, split on runs of commas, and name each
# row's values by its row number so stack() pairs every value with its row.
# `whitespace` is spelled out in full rather than relying on partial argument
# matching, and seq_len() stays correct for a zero-row data frame.
a <- setNames(
  strsplit(trimws(df$col1, whitespace = ","), ",+"),
  seq_len(nrow(df))
)
# table() counts each (value, row) pair; transpose so that rows are records.
as.data.frame.matrix(t(table(stack(a))))
101 102 200 201 9
1 1 0 1 1 1
2 1 1 0 1 0
3 1 1 1 1 1
4 1 2 1 0 1

An option could be:
First remove the "," at begin and end using str_sub from stringr
One Hot encode the column using mtabulate and strsplit with sep of ",,"
Order the column names based on number
Finally, give the columns the "col1_" names using paste0
Which gives this as result:
# Sample data: one string column whose entries are wrapped in single commas
# and whose values are separated by double commas.
df <- read.table(text = "col1
,101,,9,,201,,200,
,201,,101,,102,
,9,,101,,102,,200,,201,
,101,,200,,9,,102,,102,", header = TRUE)
library(stringr)
library(qdapTools)
# Drop the leading and trailing comma of every entry.
df$col1 <- str_sub(df$col1, 2, -2)
# Split on the ",," separator and tabulate the values of each row.
df <- mtabulate(strsplit(df$col1, ",,"))
# Order the columns numerically; drop = FALSE keeps a data frame even when
# only a single distinct value (one column) is present.
df <- df[, order(as.numeric(names(df))), drop = FALSE]
names(df) <- paste0("col1_", names(df))
df
#> col1_9 col1_101 col1_102 col1_200 col1_201
#> 1 1 1 0 1 1
#> 2 0 1 1 0 1
#> 3 1 1 1 1 1
#> 4 1 1 2 1 0
Created on 2022-07-21 by the reprex package (v2.0.1)

Related

Counting Frequencies of Sequences

Suppose there are two students - each student takes an exam multiple times (e.g.result_id = 1 is the first exam, result_id = 2 is the second exam, etc.). The student can either "pass" (1) or "fail" (0).
The data looks something like this:
library(data.table)
my_data = data.frame(id = c(1,1,1,1,1,1,2,2,2,2,2,2,2,2,2), results = c(0,1,0,1,0,0,1,1,1,0,1,1,0,1,0), result_id = c(1,2,3,4,5,6,1,2,3,4,5,6,7,8,9))
my_data = setDT(my_data)
id results result_id
1: 1 0 1
2: 1 1 2
3: 1 0 3
4: 1 1 4
5: 1 0 5
6: 1 0 6
7: 2 1 1
8: 2 1 2
9: 2 1 3
10: 2 0 4
11: 2 1 5
12: 2 1 6
13: 2 0 7
14: 2 1 8
15: 2 0 9
I am interested in counting the number of times that a student passes an exam, given that the student passed the previous two exams.
I tried to do this with the following code:
# Build lagged copies of the results column.
# NOTE: shift() is applied to the whole column here, with no grouping by id,
# so the lagged values on a student's first rows come from the preceding
# student's rows instead of being NA.
my_data$current_exam = shift(my_data$results, 0)
my_data$prev_exam = shift(my_data$results, 1)
my_data$prev_2_exam = shift(my_data$results, 2)
# Count how often each (id, current, previous, two-back) combination occurs,
# skipping rows whose previous result is missing
out <- my_data[!is.na(prev_exam), .(tally = .N), by = .(id, current_exam, prev_exam, prev_2_exam)]
# Drop the remaining combinations that contain an NA (no result two exams back)
out = na.omit(out)
My code produces the following results:
> out
id current_exam prev_exam prev_2_exam tally
1: 1 0 1 0 2
2: 1 1 0 1 1
3: 1 0 0 1 1
4: 2 1 0 0 1
5: 2 1 1 0 2
6: 2 1 1 1 1
7: 2 0 1 1 2
8: 2 1 0 1 2
9: 2 0 1 0 1
However, I do not think that my code is correct.
For example, with Student_ID = 2 :
My code says that "Current_Exam = 1, Prev_Exam = 1, Prev_2_Exam = 0" happens 1 time, but looking at the actual data - this does not happen at all
Can someone please show me what I am doing wrong and how I can correct this?
Note: I think that this should be the expected output:
> expected_output
id current_exam prev_exam prev_2_exam tally
1: 1 0 1 0 2
2: 1 1 0 1 1
3: 1 0 0 1 1
4: 2 1 0 0 1
5: 2 1 1 0 1
6: 2 1 1 1 1
7: 2 0 1 1 2
8: 2 1 0 1 2
9: 2 0 1 0 0
You did not take into account that the results have to be shifted within each id, placing NA at the start of each id's series, rather than across the whole column.
# Work on a plain data.frame copy: `my_data` was converted with setDT(), and a
# data.table treats the character index used in the aggregate() call below as
# a keyed join instead of a column selection.
. <- as.data.frame(my_data)
. <- .[order(.$id, .$result_id), ] # sort if needed
# Previous result within each id; the first exam of every id gets NA.
.$p1 <- ave(.$results, .$id, FUN = \(x) c(NA, x[-length(x)]))
# Result two exams back, obtained by lagging p1 once more within each id.
.$p2 <- ave(.$p1, .$id, FUN = \(x) c(NA, x[-length(x)]))
# Count rows per (id, results, p1, p2); aggregate() omits rows whose grouping
# values contain NA.
aggregate(list(tally = .$p1), .[c("id", "results", "p1", "p2")], length)
# id results p1 p2 tally
#1 1 0 1 0 2
#2 2 0 1 0 1
#3 2 1 1 0 1
#4 1 0 0 1 1
#5 1 1 0 1 1
#6 2 1 0 1 2
#7 2 0 1 1 2
#8 2 1 1 1 1
.
# id results result_id p1 p2
#1 1 0 1 NA NA
#2 1 1 2 0 NA
#3 1 0 3 1 0
#4 1 1 4 0 1
#5 1 0 5 1 0
#6 1 0 6 0 1
#7 2 1 1 NA NA
#8 2 1 2 1 NA
#9 2 1 3 1 1
#10 2 0 4 1 1
#11 2 1 5 0 1
#12 2 1 6 1 0
#13 2 0 7 1 1
#14 2 1 8 0 1
#15 2 0 9 1 0
An option would be to use filter to indicate those which had passed 3 times in a row.
# Rolling sum of the current and the two preceding results within each id.
# stats::filter is qualified explicitly because dplyr::filter masks it
# whenever dplyr (or the tidyverse) is attached.
cbind(., n = ave(.$results, .$id,
  FUN = \(x) stats::filter(x, c(1, 1, 1), sides = 1)
))
# id results result_id n
#1 1 0 1 NA
#2 1 1 2 NA
#3 1 0 3 1
#4 1 1 4 2
#5 1 0 5 1
#6 1 0 6 1
#7 2 1 1 NA
#8 2 1 2 NA
#9 2 1 3 3
#10 2 0 4 2
#11 2 1 5 2
#12 2 1 6 2
#13 2 0 7 2
#14 2 1 8 2
#15 2 0 9 1
If you only need the number of times that a student passes an exam, given that the student passed the previous two exams:
# A centred 3-wide window that sums to 3 marks three passes in a row; the NA
# values produced at the edges of each id are dropped by na.rm = TRUE.
# stats::filter is qualified explicitly because dplyr::filter masks it
# whenever dplyr (or the tidyverse) is attached.
sum(
  ave(.$results, .$id, FUN = \(x) stats::filter(x, c(1, 1, 1)) == 3),
  na.rm = TRUE
)
#[1] 1
# Same count without filter(): within each id, flag the positions where a
# result and the two results after it (zero-padded at the end) are all
# passes, then total the flags.
sum(ave(.$results, .$id, FUN = \(res) {
  next_one <- c(res[-1], 0)
  next_two <- c(res[-1:-2], 0, 0)
  res == 1 & next_one == 1 & next_two == 1
}))
#[1] 1
When trying to count events that happen in series, cumsum() comes in quite handy. As opposed to creating multiple lagged variables, this scales well to counts across a larger number of events:
library(tidyverse)
d <- my_data |>
  group_by(id) |> # cumulate within each student only
  mutate(
    # running total of passed exams
    csum = cumsum(results),
    # subtract the running total from 3 observations earlier (0 when there is
    # no such observation); this leaves the number of exams passed among the
    # current and the previous 2 observations
    i = csum - lag(csum, n = 3, default = 0)
  )
# Ungroup to get global count
d |>
ungroup() |>
count(i == 3) # Count the number of cases where the number of exams passes within 3 observations equals 3
#> # A tibble: 2 × 2
#> `i == 3` n
#> <lgl> <int>
#> 1 FALSE 14
#> 2 TRUE 1
# Retaining the group gives counts by student
d |>
count(i == 3) # Count the number of cases where the number of exams passes within 3 observations equals 3
#> # A tibble: 3 × 3
#> # Groups: id [2]
#> id `i == 3` n
#> <dbl> <lgl> <int>
#> 1 1 FALSE 6
#> 2 2 FALSE 8
#> 3 2 TRUE 1
Since you provided the data as data.table, here is how to do the same in that ecosystem:
# Running number of passes per student.
my_data[, csum := cumsum(results), by = .(id)]
# Passes among the current and the two previous exams. data.table's shift()
# provides the lag: a bare lag() only works here while dplyr is attached,
# otherwise stats::lag() is called, which does not shift a plain vector.
my_data[, i := csum - shift(csum, n = 3, fill = 0), by = .(id)]
# Number of exams per student that complete three passes in a row.
my_data[, .(n_cases = sum(i == 3)), by = id]
#> id n_cases
#> 1: 1 0
#> 2: 2 1
Here's an approach using dplyr. It uses the lag function to look back 1 and 2 results. If the sum together with the current result is 3, then the condition is met. In the example you provided, the condition is only met once
my_data %>%
group_by(id) %>%
mutate(threex = ifelse(results + lag(results,1) + lag(results, 2) == 3, 1, 0)) %>%
filter(!is.na(threex))
id results result_id threex
<dbl> <dbl> <dbl> <dbl>
1 1 0 3 0
2 1 1 4 0
3 1 0 5 0
4 1 0 6 0
5 2 1 3 1
6 2 0 4 0
7 2 1 5 0
8 2 1 6 0
9 2 0 7 0
10 2 1 8 0
11 2 0 9 0
If you then just want to capture the cases when the condition is met, add a filter.
my_data %>%
group_by(id) %>%
mutate(threex = ifelse(results + lag(results,1) + lag(results, 2) == 3, 1, 0)) %>%
filter(threex == 1)
id results result_id threex
<dbl> <dbl> <dbl> <dbl>
1 2 1 3 1
If you are looking to understand how many times the condition is met per id, you can do this.
my_data %>%
  group_by(id) %>%
  mutate(
    # passes among the current and the two previous exams (NA for the first
    # two exams of each id)
    last_three = results + lag(results, 1) + lag(results, 2),
    threex = ifelse(last_three == 3, 1, 0)
  ) %>%
  # keep only the exams that complete three passes in a row
  filter(threex == 1) %>%
  select(id) %>%
  # one row per id that met the condition, with the number of times it did
  summarize(count = n())
id count
<dbl> <int>
1 2 1

How to count the number of changes in ROW (R)

df <- data.frame(Num1=c(1,0,1,0,1), Num2=c(0,1,1,0,1), Num3=c(1,1,1,1,1), Num4=c(1,1,0,0,1), Num5=c(1,1,1,0,0))
I need to count how many times value changes in each row in r.
Thank you!
rowSums(df[-1] != df[-ncol(df)])
[1] 2 1 2 2 1
I.e., in the first row there is a change from 1 to 0 and then back to 1, so a total of 2 changes, and so on for the other rows.
Here is a try ,
df <- data.frame(
  Num1 = c(1, 0, 1, 0, 1),
  Num2 = c(0, 1, 1, 0, 1),
  Num3 = c(1, 1, 1, 1, 1),
  Num4 = c(1, 1, 0, 0, 1),
  Num5 = c(1, 1, 1, 0, 0)
)
# Count the positions where a value differs from the one before it. Testing
# diff(x) != 0 counts every change once whatever its size, whereas summing
# abs(diff(x)) only equals the number of changes for 0/1 data.
df$n_changes <- apply(df, MARGIN = 1, function(x) sum(diff(x) != 0))
df
#> Num1 Num2 Num3 Num4 Num5 n_changes
#> 1 1 0 1 1 1 2
#> 2 0 1 1 1 1 1
#> 3 1 1 1 0 1 2
#> 4 0 0 1 0 0 2
#> 5 1 1 1 1 0 1
Created on 2022-06-02 by the reprex package (v2.0.1)
We may loop over the row with apply (MARGIN=1), then use rle (run-length-encoding), check the lengths
# Every row has one more run than it has changes, so count the runs found by
# rle() and subtract one.
apply(df, 1, function(x) length(rle(x)$lengths)) - 1
[1] 2 1 2 2 1

Response change analysis in r

I am trying to explore the response change patterns for particular questions. Here is an example of dataset.
id <- c(1,1,1, 2,2,2, 3,3,3,3, 4,4)
item.id <- c(1,1,1, 1,1,1 ,1,1,2,2, 1,1)
sequence <- c(1,2,3, 1,2,3, 1,2,1,2, 1,2)
score <- c(0,0,0, 0,0,1, 0,1,0,0, 1,0)
data <- data.frame("id"=id, "item.id"=item.id, "sequence"=sequence, "score"=score)
data
id item.id sequence score
1 1 1 1 0
2 1 1 2 0
3 1 1 3 0
4 2 1 1 0
5 2 1 2 0
6 2 1 3 1
7 3 1 1 0
8 3 1 2 1
9 3 2 1 0
10 3 2 2 0
11 4 1 1 1
12 4 1 2 0
id represents persons, item.id is for questions. sequence is for the attempt to change the response, and the score is the score of the item.
What I am trying to do is to subset those whose score was changed from 0 to 1 and those whose score was changed from 1 to 0.
The desired outputs would be:
data.0.to.1
id item.id sequence score
2 1 1 0
2 1 2 0
2 1 3 1
3 1 1 0
3 1 2 1
data.1.to.0
id item.id sequence score
4 1 1 1
4 1 2 0
Any thoughts? Thanks!
Here is one option by taking the difference of 'score' grouped by 'id', 'item.id'
library(dplyr)
data %>%
  group_by(id, item.id) %>%
  # keep only the person/item series that contain at least one non-zero score
  filter(any(score != 0)) %>%
  # step-to-step change in score within each series (0 for the first attempt)
  mutate(ind = c(0, diff(score))) %>%
  # label every series with its first non-zero change and regroup by it
  group_by(ind = ind[ind != 0][1]) %>%
  # `.keep` replaces the deprecated `keep` argument; the grouping column is
  # already set by group_by(), and extra columns passed to group_split() are
  # ignored on a grouped data frame, so `ind` is not repeated here
  group_split(.keep = FALSE)
#[[1]]
# A tibble: 2 x 4
# id item.id sequence score
# <dbl> <dbl> <dbl> <dbl>
#1 4 1 1 1
#2 4 1 2 0
#[[2]]
# A tibble: 5 x 4
# id item.id sequence score
# <dbl> <dbl> <dbl> <dbl>
#1 2 1 1 0
#2 2 1 2 0
#3 2 1 3 1
#4 3 1 1 0
#5 3 1 2 1
I'd do this:
library(dplyr)
# Person/item series in which the score rises at least once.
data.0.to.1 <- data %>%
  group_by(id, item.id) %>%
  filter(any(diff(score) > 0))
# Person/item series in which the score drops at least once.
data.1.to.0 <- data %>%
  group_by(id, item.id) %>%
  filter(any(diff(score) < 0))

Change variable value for the first row group_by subject ID using dplyr

More than 2,000 subjects. I would like to change the value for 'time2' to 0 for each first row by subject. For instance, ID=2 subject has 1 for 'time2' at first row of this subject. How to change it to 0, considering 2k subjects?
ID time1 time2
1 0 0
1 0 1
1 1 5
2 0 1
2 1 3
2 3 5
3 ....
With dplyr, we can use ifelse based on a logical condition with row_number()
df2 %>%
group_by(ID) %>%
mutate(time2 = ifelse(row_number()==1, 0, time2))
# A tibble: 6 x 3
# Groups: ID [2]
# ID time1 time2
# <int> <int> <dbl>
#1 1 0 0
#2 1 0 1
#3 1 1 5
#4 2 0 0
#5 2 1 3
#6 2 3 5
Or using data.table, create a row index (.I) grouped by 'ID' and assign (:=) those elements in 'time2' that corresponds to the row index to 0
library(data.table)
# Inner query: global row index (.I) of the first row of every ID.
# Outer call: set time2 to 0 on exactly those rows by reference; the trailing
# [] prints the updated table.
setDT(df2)[
  df2[, .I[1L], by = ID]$V1,
  time2 := 0
][]
# ID time1 time2
#1: 1 0 0
#2: 1 0 1
#3: 1 1 5
#4: 2 0 0
#5: 2 1 3
#6: 2 3 5
Or a compact base R option would be (assuming that 'ID' is ordered)
df$time2[!duplicated(df$ID)] <- 0
df
# ID time1 time2
#1 1 0 0
#2 1 0 1
#3 1 1 5
#4 2 0 0
#5 2 1 3
#6 2 3 5
You could also use dplyr in combination with replace:
df %>%
dplyr::group_by(ID) %>%
dplyr::mutate(time2 = replace(time2, 1, 0))
# Source: local data frame [6 x 3]
# Groups: ID [2]
#
# ID time1 time2
# <int> <int> <dbl>
# 1 1 0 0
# 2 1 0 1
# 3 1 1 5
# 4 2 0 0
# 5 2 1 3
# 6 2 3 5

Create a sequence conditional on values in other columns in R

I am working with a dataframe similar to the following:
df = data.frame(ID1 = c(2,2,2,2,2,2,2),
ID2 = c(1,1,1,1,1,1,1),
flagTag = c(0,0,0,0,1,0,0))
I need to create a new field "newField" such that the value increments when flagTag = 1 within each group of ID1 and ID2 (thus unique records are identified by the combination of ID1 and ID2). The resulting table should look similar to this:
ID1 ID2 flagTag newField
1 2 1 0 1
2 2 1 0 1
3 2 1 0 1
4 2 1 0 1
5 2 1 1 2
6 2 1 0 2
I am trying to do this using dplyr but couldn't come up with a logic to do such manipulation. One way is to go record by record in the dataframe and update "newField" in loop which will be a slow procedure.
Let's use cumsum and mutate:
library(dplyr)
df %>%
group_by(ID1, ID2) %>%
mutate(newField = 1 + cumsum(flagTag))
ID1 ID2 flagTag newField
<dbl> <dbl> <dbl> <dbl>
1 2 1 0 1
2 2 1 0 1
3 2 1 0 1
4 2 1 0 1
5 2 1 1 2
6 2 1 0 2
7 2 1 0 2
Here is a base R option with ave
df$newField <- with(df, ave(flagTag, ID1, ID2, FUN = cumsum)+1)
df$newField
#[1] 1 1 1 1 2 2 2
Or using data.table
library(data.table)
setDT(df)[, newField := cumsum(flagTag) + 1, .(ID1, ID2)]

Resources