Spreading multiple columns over a column in R - r

Sorry because this question has been asked several times, but I'm still having trouble wrapping my head around this problem.
So I have a dataframe, of the form:
ID Val Type
1 10 A
2 11 A
2 10 C
3 10 B
3 12 C
4 9 B
It's not much help but you can use
library(tidyr)
test <- data.frame(ID = c(1,2,2,3,3,4),
Val = c(10,11,10,10,12,9),
Type = c('A', 'A', 'C', 'B', 'C', 'B'))
I would like to split it to obtain:
ID A.Type B.Type C.Type A.Val B.Val C.Val
1 1 0 0 10 0 0
2 1 0 1 11 0 10
3 0 1 1 0 10 12
4 0 0 0 0 9 0
I know how to get columns 1:4 using:
table(test[, c(1, 3)]) %>% as.data.frame() %>% spread(Type, Freq)
It's the last three I need help with because in the actual data-frame values are continuous and table can not be used.

You are trying to reshape your data with multiple value variables where the ones are actually implicit, so in order to get the type_... columns, you will need to create a new type variable with ones and then use dcast from data.table package:
library(data.table)
setDT(test)
dcast(test[, type := 1][], ID ~ Type, value.var = c("type", "Val"),fill = 0)
# ID type_A type_B type_C Val_A Val_B Val_C
# 1: 1 1 0 0 10 0 0
# 2: 2 1 0 1 11 0 10
# 3: 3 0 1 1 0 10 12
# 4: 4 0 1 0 0 9 0
Or you can use reshape from base R, where NA has to be manually replaced:
test$type = 1
reshape(test, idvar = "ID", timevar = "Type", direction = "wide")
# ID Val.A type.A Val.C type.C Val.B type.B
# 1 1 10 1 NA NA NA NA
# 2 2 11 1 10 1 NA NA
# 4 3 NA NA 12 1 10 1
# 6 4 NA NA NA NA 9 1

Related

New variable that indicates the first occurrence of a specific value

I want to create a new variable that indicates the first specific observation of a value for a variable.
In the following example dataset I want to have a new variable "firstna" that is "1" for the first observation of "NA" for this player.
game_data <- data.frame(player = c(1,1,1,1,2,2,2,2), level = c(1,2,3,4,1,2,3,4), points = c(20,NA,NA,NA,20,40,NA,NA))
game_data
player level points
1 1 1 20
2 1 2 NA
3 1 3 NA
4 1 4 NA
5 2 1 20
6 2 2 40
7 2 3 NA
8 2 4 NA
The resulting dataframe should look like this:
game_data_new <- data.frame(player = c(1,1,1,1,2,2,2,2), level = c(1,2,3,4,1,2,3,4), points = c(20,NA,NA,NA,20,40,NA,NA), firstna = c(0,1,0,0,0,0,1,0))
game_data_new
player level points firstna
1 1 1 20 0
2 1 2 NA 1
3 1 3 NA 0
4 1 4 NA 0
5 2 1 20 0
6 2 2 40 0
7 2 3 NA 1
8 2 4 NA 0
To be honest i don't know how to do this. It would be perfect if there is a dplyr option to do so.
A base R solution:
ave(game_data$points, game_data$player,
FUN = function(x) seq_along(x) == match(NA, x, nomatch = 0))
Another ave option to find out first NA by group (player).
game_data$firstna <- ave(game_data$points, game_data$player,
FUN = function(x) cumsum(is.na(x)) == 1)
game_data
# player level points firstna
#1 1 1 20 0
#2 1 2 NA 1
#3 1 3 NA 0
#4 1 4 NA 0
#5 2 1 20 0
#6 2 2 40 0
#7 2 3 NA 1
#8 2 4 NA 0
Here is a solution with data.table:
library("data.table")
game_data <- data.table(player = c(1,1,1,1,2,2,2,2), level = c(1,2,3,4,1,2,3,4), points = c(20,NA,NA,NA,20,40,NA,NA))
game_data[, firstna:=is.na(points) & !is.na(shift(points)), player][]
# > game_data[, firstna:=is.na(points) & !is.na(shift(points)), player][]
# player level points firstna
# 1: 1 1 20 FALSE
# 2: 1 2 NA TRUE
# 3: 1 3 NA FALSE
# 4: 1 4 NA FALSE
# 5: 2 1 20 FALSE
# 6: 2 2 40 FALSE
# 7: 2 3 NA TRUE
# 8: 2 4 NA FALSE
You can do this by grouping by player and then mutating to check if a row has an NA value and the previous row doesn't
game_data %>%
group_by(player) %>%
mutate(firstna = ifelse(is.na(points) & lag(!is.na(points)),1,0)) %>%
ungroup()
Result:
# A tibble: 8 x 4
# Groups: player [2]
player level points firstna
<dbl> <dbl> <dbl> <dbl>
1 1 1 20 0
2 1 2 NA 1
3 1 3 NA 0
4 1 4 NA 0
5 2 1 20 0
6 2 2 40 0
7 2 3 NA 1
8 2 4 NA 0
library(tidyverse)
library(data.table)
data.frame(
player = c(1,1,1,1,2,2,2,2),
level = c(1,2,3,4,1,2,3,4),
points = c(20,NA,NA,NA,20,40,NA,NA)
) -> game_data
game_data_base1 <- game_data
game_data_dt <- data.table(game_data)
microbenchmark::microbenchmark(
better_base = game_data$first_na <- ave(
game_data$points,
game_data$player,
FUN=function(x) seq_along(x)==match(NA,x,nomatch=0)
),
brute_base = do.call(
rbind.data.frame,
lapply(
split(game_data, game_data$player),
function(x) {
x$firstna <- 0
na_loc <- which(is.na(x$points))
if (length(na_loc) > 0) x$firstna[na_loc[1]] <- 1
x
}
)
),
tidy = game_data %>%
group_by(player) %>%
mutate(firstna=as.numeric(is.na(points) & !duplicated(points))) %>%
ungroup(),
dt = game_data_dt[, firstna:=as.integer(is.na(points) & !is.na(shift(points))), player]
)
## Unit: microseconds
## expr min lq mean median uq max neval
## better_base 125.188 156.861 362.9829 191.6385 355.6675 3095.958 100
## brute_base 366.642 450.002 2782.6621 658.0380 1072.6475 174373.974 100
## tidy 998.924 1119.022 2528.3687 1509.0705 2516.9350 42406.778 100
## dt 330.428 421.211 1031.9978 535.8415 1042.1240 9671.991 100
game_data %>%
group_by(player) %>%
mutate(firstna=as.numeric(is.na(points) & !duplicated(points)))
Group by player, then create a boolean vector for cases that are both NA and not duplicates for previous rows.
# A tibble: 8 x 4
# Groups: player [2]
player level points firstna
<dbl> <dbl> <dbl> <dbl>
1 1 1 20 0
2 1 2 NA 1
3 1 3 NA 0
4 1 4 NA 0
5 2 1 20 0
6 2 2 40 0
7 2 3 NA 1
8 2 4 NA 0
If you want the 1s on the last non-NA line before an NA, replace the mutate line with this:
mutate(lastnonNA=as.numeric(!is.na(points) & is.na(lead(points))))
First row of a block of NAs that runs all the way to the end of the player's group:
game_data %>%
group_by(player) %>%
mutate(firstna=as.numeric(is.na(points) & !duplicated(cbind(points,cumsum(!is.na(points))))))
Another way using base:
game_data$firstna <-
unlist(
tapply(game_data$points, game_data$player, function(x) {i<-which(is.na(x))[1];x[]<-0;x[i]<-1;x})
)
or as another ?ave clone:
ave(game_data$points, game_data$player, FUN = function(x) {
i<-which(is.na(x))[1];x[]<-0;x[i]<-1;x
})
An option using diff
transform(game_data, firstna = ave(is.na(points), player, FUN = function(x) c(0,diff(x))))
# player level points firstna
# 1 1 1 20 0
# 2 1 2 NA 1
# 3 1 3 NA 0
# 4 1 4 NA 0
# 5 2 1 20 0
# 6 2 2 40 0
# 7 2 3 NA 1
# 8 2 4 NA 0
And its dplyr equivalent:
library(dplyr)
game_data %>% group_by(player) %>% mutate(firstna = c(0,diff(is.na(points))))
# # A tibble: 8 x 4
# # Groups: player [2]
# player level points firstna
# <dbl> <dbl> <dbl> <dbl>
# 1 1 1 20 0
# 2 1 2 NA 1
# 3 1 3 NA 0
# 4 1 4 NA 0
# 5 2 1 20 0
# 6 2 2 40 0
# 7 2 3 NA 1
# 8 2 4 NA 0

Deleting unnecessary rows after column shuffling in a data frame in R

I have a data frame as below. The Status of each ID recorded in different time points. 0 means the person is alive and 1 means dead.
ID Status
1 0
1 0
1 1
2 0
2 0
2 0
3 0
3 0
3 0
3 1
I want to shuffle the column Status and each ID can have a status of 1, just one time. After that, I want to have NA for other rows. For instance, I want my data frame to look like below after shuffling:
ID Status
1 0
1 0
1 0
2 0
2 1
2 NA
3 0
3 1
3 NA
3 NA
From the data you posted and your example output, it looks like you want to randomly sample df$Status and then do the replacement. To get what you want in one step you could do:
set.seed(3)
df$Status <- ave(sample(df$Status), df$ID, FUN = function(x) replace(x, which(cumsum(x)>=1)[-1], NA))
df
# ID Status
#1 1 0
#2 1 0
#3 1 0
#4 2 1
#5 2 NA
#6 2 NA
#7 3 0
#8 3 0
#9 3 1
#10 3 NA
One option to use cumsum of cumsum to decide first 1 appearing for an ID.
Note that I have modified OP's sample dataframe to represent logic of reshuffling.
library(dplyr)
df %>% group_by(ID) %>%
mutate(Sum = cumsum(cumsum(Status))) %>%
mutate(Status = ifelse(Sum > 1, NA, Status)) %>%
select(-Sum)
# # A tibble: 10 x 2
# # Groups: ID [3]
# ID Status
# <int> <int>
# 1 1 0
# 2 1 0
# 3 1 1
# 4 2 0
# 5 2 1
# 6 2 NA
# 7 3 0
# 8 3 1
# 9 3 NA
# 10 3 NA
Data
df <- read.table(text =
"ID Status
1 0
1 0
1 1
2 0
2 1
2 0
3 0
3 1
3 0
3 0", header = TRUE)

Casting from Long to Wide format, with each repeat creating a new row

I have a data frame in long format, that I'd like to transform into wide format. The data frame has several repeated identifiers that I'd like to treat as unique instances, and represent them as individual rows in the wide data frame.
My question is similar to this one:
Forcing unique values before casting (pivoting) in R
But in the above question the unique entries end up as individual columns. For my question, I would like to have the data put into individual rows. For example:
ID1<-c("A","A","A","A","A","B","B","B","B","B","C","C","C","C","C")
ID2<-c("R","R","R","L","L","R","R","L","L","R","R","L","L","R","R")
Sp<-c("Bird","Cat","Bird","Bird","Dog","Dog","Dog","Cat","Cat","Bird","Cat","Dog","Bird","Bird","Cat")
Count<-c(1,2,2,1,2,1,2,3,2,1,2,3,2,1,5)
DF<-data.frame(ID1,ID2,Sp,Count)
After I cast my data into wide format, I'd like the output data to look like this:
ID1 ID2 Bird Cat Dog
A R 1 2 0
A R 2 0 0 # 2 Birds in the A/ R combination so need second row (don't want to add them together)
A L 1 0 2
B R 1 0 1
B R 0 0 2
B L 0 3 0
B L 0 2 0
C R 1 2 0
C R 0 5 0
C L 2 0 3
If there are no repeats in the unique ID1/ ID2 combination, the cast wouild work as normal. But if there is a repeat, a second (or third or fourth) row would be created.
You can create an auxiliary ID column per group of ID1, ID2 and Sp, and then reshape with ID1, ID2 and AUXID as id columns:
library(dplyr)
DF = DF %>% group_by(ID1, ID2, Sp) %>% mutate(AUXID = row_number()) %>% as.data.frame()
reshape(DF, idvar = c("ID1", "ID2", "AUXID"), timevar = "Sp", dir = "wide")
# ID1 ID2 AUXID Count.Bird Count.Cat Count.Dog
# 1 A R 1 1 2 NA
# 3 A R 2 2 NA NA
# 4 A L 1 1 NA 2
# 6 B R 1 1 NA 1
# 7 B R 2 NA NA 2
# 8 B L 1 NA 3 NA
# 9 B L 2 NA 2 NA
# 11 C R 1 1 2 NA
# 12 C L 1 2 NA 3
# 15 C R 2 NA 5 NA
You can drop the AUXID column and fill NA afterwards.
Here is a data.table version with dcast() which provides a fill parameter to fill NA values:
library(data.table)
(dcast(setDT(DF)[, AUXID := 1:.N, .(ID1, ID2, Sp)],
ID1 + ID2 + AUXID ~ Sp, value.var = "Count", fill = 0)
[, AUXID := NULL][])
# ID1 ID2 Bird Cat Dog
# 1: A L 1 0 2
# 2: A R 1 2 0
# 3: A R 2 0 0
# 4: B L 0 3 0
# 5: B L 0 2 0
# 6: B R 1 0 1
# 7: B R 0 0 2
# 8: C L 2 0 3
# 9: C R 1 2 0
#10: C R 0 5 0

R - sequence calculations both forward and backward looking

I have the following data frame:
id = c("A","A","A","A","A","A","B","B","B","B","B","B","C","C","C","C","C","C")
month = c(1,2,3,4,5,6,1,2,3,4,5,6,1,2,3,4,5,6)
amount = c(0,0,10,0,0,0,0,10,0,10,0,0,0,0,0,10,10,0)
df <- data.frame(id, month, amount)
What I need to do (by ID) is:
Calculate (by way of a negative number) the difference in months between zero and non-zero "amount" rows until such time as the "amount" equals 0. When this happens, the time = 0. THEN, once an "amount" exceeds zero in the sequence, the calculation (by way of a positive number) will look back and calculate the difference in months between non-zero and the historical zero "amount" row.
The solution would look like:
solution = c(-2,-1,0,1,2,3,-1,0,1,0,1,2,-3,-2,-1,0,0,1)
As you can probably tell, its pretty tough to search for this multi-layered problem. Ideally the answer would be using data.table as i'm dealing with millions of rows, but dplyr would also suit my needs.
Any help appreciated.
S.
library(data.table)
setDT(DT)
DT[, g := rleid(id, amount != 0)]
DT[, g_id := g - g[1L], by=id]
DT[, v :=
if (g_id == 0L)
-(.N:1)
else if (g_id %% 2 == 0)
1:.N
else
0L
, by=.(id, g_id)]
all.equal(DT$v, solution) # TRUE
To see how it works:
id month amount g g_id v
1: A 1 0 1 0 -2
2: A 2 0 1 0 -1
3: A 3 10 2 1 0
4: A 4 0 3 2 1
5: A 5 0 3 2 2
6: A 6 0 3 2 3
7: B 1 0 4 0 -1
8: B 2 10 5 1 0
9: B 3 0 6 2 1
10: B 4 10 7 3 0
11: B 5 0 8 4 1
12: B 6 0 8 4 2
13: C 1 0 9 0 -3
14: C 2 0 9 0 -2
15: C 3 0 9 0 -1
16: C 4 10 10 1 0
17: C 5 10 10 1 0
18: C 6 0 11 2 1
You can drop the extra columns with DT[, c("g", "g_id") := NULL].
With tidyr and dplyr
library(dplyr)
library(tidyr)
df_new <- df %>%
group_by(id) %>%
# identify non-zero instances
mutate(temp = ifelse(amount != 0, month, NA)) %>%
# fill down first
fill(temp, .direction = "down") %>%
# fill up after
fill(temp, .direction = "up") %>%
# calculate difference
mutate(solution = month - temp) %>%
# remove temp
select(-temp)
Result
# id month amount solution
# <fctr> <dbl> <dbl> <dbl>
# 1 A 1 0 -2
# 2 A 2 0 -1
# 3 A 3 10 0
# 4 A 4 0 1
# 5 A 5 0 2
# 6 A 6 0 3
# 7 B 1 0 -1
# 8 B 2 10 0
# 9 B 3 0 1
# 10 B 4 10 0
# 11 B 5 0 1
# 12 B 6 0 2
# 13 C 1 0 -3
# 14 C 2 0 -2
# 15 C 3 0 -1
# 16 C 4 10 0
# 17 C 5 10 0
# 18 C 6 0 1

How to subset a flat contingency table in R without losing row & column names?

I'm using ftable to create a flat contingency table. However, when I subset the contingency table, R removes the row and column names. Is there a way to subset the table such that the row and column names remain in the subsetted table? Here's an example:
# Create fake data
Group1 = sample(LETTERS[1:3], 20, replace=TRUE)
Group2 = sample(letters[1:3], 20, replace=TRUE)
Year = sample(c("2010","2011","2012"), 20, replace=TRUE)
df1 = data.frame(Group1, Group2, Year)
# Create flat contingency table with column margin
table1 = ftable(addmargins(table(df1$Group1, df1$Group2, df1$Year), margin=3))
# Select rows with sum greater than 2
table2 = table1[table1[ ,4] > 2, ]
> table1
2010 2011 2012 Sum
A a 0 1 2 3
b 2 1 0 3
c 0 0 0 0
B a 0 1 1 2
b 2 0 0 2
c 1 0 1 2
C a 0 1 0 1
b 1 0 2 3
c 3 0 1 4
> table2
[,1] [,2] [,3] [,4]
[1,] 0 1 2 3
[2,] 2 1 0 3
[3,] 1 0 2 3
[4,] 3 0 1 4
Notice how R has converted the subsetted table to a matrix, stripping out the column names and both levels of row names. How can I keep the ftable structure in the subsetted table?
Consider working with a data.frame of frequencies. It is a much better data structure to work with, especially if you are going to filter it. Here is a way to build one using the reshape package.
# cast the data into a data.frame
library(reshape)
df1$Freq <- 1
df2 <- cast(df1, Group1 + Group2 ~ Year, fun = sum, value = "Freq")
df2
# Group1 Group2 2010 2011 2012
# 1 A a 0 0 1
# 2 A b 1 1 3
# 3 A c 0 0 1
# 4 B a 1 2 0
# 5 B b 1 1 0
# 6 B c 0 0 1
# 7 C a 2 0 1
# 8 C b 2 0 0
# 9 C c 0 0 2
# add a column for the `Sum` of frequencies over the years
df2 <- within(df2, Sum <- `2010` + `2011` + `2012`)
df2
# Group1 Group2 2010 2011 2012 Sum
# 1 A a 0 0 1 1
# 2 A b 1 1 3 5
# 3 A c 0 0 1 1
# 4 B a 1 2 0 3
# 5 B b 1 1 0 2
# 6 B c 0 0 1 1
# 7 C a 2 0 1 3
# 8 C b 2 0 0 2
# 9 C c 0 0 2 2
df2[df2$Sum > 2, ]
# Group1 Group2 2010 2011 2012 Sum
# 2 A b 1 1 3 5
# 4 B a 1 2 0 3
# 7 C a 2 0 1 3
The result will no longer be an ftable object,
because some of the combinations are missing.
But you can have a matrix instead, with rows and column names.
ftable_names <- function(x, which="row.vars") {
# Only tested in dimensions 1 and 2
rows <- as.vector(Reduce(
function(u,v) t(outer(as.vector(u),as.vector(v),paste)),
attr(x, which),
""
))
}
i <- table1[ ,4] > 2
table2 <- table1[i,]
rownames(table2) <- ftable_names(table1, "row.vars")[i]
colnames(table2) <- ftable_names(table1, "col.vars")
table2
# 2010 2011 2012 Sum
# A a 1 2 0 3
# A c 0 0 3 3
# B c 0 3 0 3
# C a 3 1 1 5
ftable creates ‘flat’ contingency tables [by] ... rearranging the data as a [2D] matrix. So, simply use as.matrix to convert the data to a matrix before subsetting (if you use as.table directly, the data returns to it's higher dimension structure).
# Create flat contingency table with column margin and variable names
table1 <- ftable(addmargins(table(Group1 = df1$Group1,
Group2 = df1$Group2,
Year = df1$Year), margin=3))
# Convert to matrix
mat1 <- as.matrix(table1)
mat2 <- mat1[mat1[ ,4] > 2, ]
mat2
> mat2
Year
Group1_Group2 2010 2011 2012 Sum
A_b 3 0 0 3
A_c 0 2 3 5
B_b 2 0 1 3
If you really don't like the "_", then replace using gsub.
dimnames(mat2) <- rapply(dimnames(mat2), gsub, pattern = "_", replacement = " ", how = "replace")
Edit
Or alternatively using the dplyr and tidyr packages for flexibility and readability of code:
library(dplyr)
library(tidyr)
df1 %>%
group_by(Group1, Group2, Year) %>%
tally() %>%
spread(Year, n, fill = 0) %>%
ungroup() %>%
mutate(Sum = rowSums(.[-(1:2)])) %>%
filter(Sum > 2) %>%
unite(Name, c(Group1, Group2), sep = " ")
Source: local data frame [5 x 5]
Name 2010 2011 2012 Sum
(chr) (dbl) (dbl) (dbl) (dbl)
1 A a 2 1 0 3
2 A b 1 1 1 3
3 B b 2 0 2 4
4 B c 1 2 0 3
5 C a 1 2 0 3

Resources