Restructuring and formatting data frame columns - r

dfin <-
ID SEQ GRP C1 C2 C3 T1 T2 T3
1 1 1 0 5 8 0 1 2
1 2 1 5 10 15 5 6 7
2 1 2 20 25 30 0 1 2
C1 is the concentration (CONC) at T1 (TIME) and so on. This is what I want as an output:
dfout <-
ID SEQ GRP CONC TIME
1 1 1 0 0
1 1 1 5 1
1 1 1 8 2
1 2 1 5 5
1 2 1 10 6
1 2 1 15 7
2 1 2 20 0
2 1 2 25 1
2 1 2 30 2
The real dfin has many more Cx and Tx columns, where x indexes the concentration readings.
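For a runnable example, the input can be rebuilt with read.table (a sketch covering only the three rows shown; the answers below refer to it as df):
df <- read.table(text = "
ID SEQ GRP C1 C2 C3 T1 T2 T3
1 1 1 0 5 8 0 1 2
1 2 1 5 10 15 5 6 7
2 1 2 20 25 30 0 1 2
", header = TRUE)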

You can do this with data.table::melt, which can melt the table into multiple value columns based on column-name patterns:
library(data.table)
melt(
  setDT(df),
  id.vars = c("ID", "SEQ", "GRP"),
  # columns starting with C and T are melted into two separate value columns
  measure.vars = patterns("^C", "^T"),
  value.name = c('CONC', 'TIME')
)[order(ID, SEQ)][, variable := NULL][]
# ID SEQ GRP CONC TIME
#1: 1 1 1 0 0
#2: 1 1 1 5 1
#3: 1 1 1 8 2
#4: 1 2 1 5 5
#5: 1 2 1 10 6
#6: 1 2 1 15 7
#7: 2 1 2 20 0
#8: 2 1 2 25 1
#9: 2 1 2 30 2
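As an aside (not from the original answer), the same reshape can be written with tidyr's pivot_longer, whose ".value" sentinel spreads the C and T name groups into separate value columns; a sketch assuming the df built above:
library(tidyr)
library(dplyr)
df %>%
  pivot_longer(cols = matches("^[CT][0-9]+$"),
               names_to = c(".value", "x"),
               names_pattern = "([CT])([0-9]+)") %>%
  rename(CONC = C, TIME = `T`) %>%
  select(-x)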
Or, if the value column names follow the pattern [CT][0-9], you can use reshape from base R with sep="", which splits the value column names at the letter/digit boundary due to this default setting (from ?reshape):
split = if (sep == "") {
list(regexp = "[A-Za-z][0-9]", include = TRUE)
} else {
list(regexp = sep, include = FALSE, fixed = TRUE)}
One catch: this name-based splitting only happens when v.names is left unspecified. Supplying v.names together with an atomic varying makes reshape pair the columns by recycling v.names across them (C1 -> CONC, C2 -> TIME, C3 -> CONC, ...), which scrambles CONC and TIME here. So let reshape guess the names from the split and rename afterwards:
out <- reshape(df, varying = -(1:3), idvar = c("ID", "SEQ", "GRP"),
               dir = "long", sep = "")
names(out)[names(out) %in% c("C", "T")] <- c("CONC", "TIME")
out
# ID SEQ GRP time CONC TIME
#1: 1 1 1 1 0 0
#2: 1 2 1 1 5 5
#3: 2 1 2 1 20 0
#4: 1 1 1 2 5 1
#5: 1 2 1 2 10 6
#6: 2 1 2 2 25 1
#7: 1 1 1 3 8 2
#8: 1 2 1 3 15 7
#9: 2 1 2 3 30 2

Sample from different data.frame

data1 = data.frame("Group1" = sample(1:2, 100, r = T),
                   "Group2" = sample(c('a', 'b'), 100, r = T),
                   "V1" = sample(1:3, 100, r = T),
                   "V2" = sample(0:1, 100, r = T),
                   "V3" = sample(1:5, 100, r = T),
                   "V4" = sample(1:2, 100, r = T))
data2 = data.frame("Group1" = c(1, 1, 2, 2),
                   "Group2" = c('a', 'b', 'a', 'b'),
                   "Size" = c(9, 7, 6, 10),
                   "V1" = c(NA),
                   "V2" = c(NA),
                   "V3" = c(NA),
                   "V4" = c(NA))
I have 'data1' that contains my data, and 'data2' which has 'Group1', 'Group2', and 'Size'.
What I want is to group my data by 'Group1' and 'Group2' and take a random sample of size 'Size' from 'data1' to fill in V1-V4 in 'data2'.
The desired output would look like the result of the following, but with the NA values filled in based on 'data1':
library(dplyr); library(tidyr)
data3 <- data2 %>%
  uncount(Size)
library(data.table)
setDT(data1)
setDT(data2)
# sample 'Size' row indices from each group
# (data2[data1] keeps data1's row order, so .I indexes data1 rows)
i <- data2[data1, on = .(Group1, Group2)
  ][, .(i_samp = sample(.I, Size)), by = .(Group1, Group2, Size)
  ][, i_samp]
# subset data1 to the sampled rows and re-attach Size
merge(data1[i], data2[, .(Group1, Group2, Size)])
# Group1 Group2 V1 V2 V3 V4 Size
# 1: 1 a 3 1 2 2 9
# 2: 1 a 3 1 5 1 9
# 3: 1 a 2 1 4 2 9
# 4: 1 a 3 1 1 1 9
# 5: 1 a 3 1 4 1 9
# 6: 1 a 1 0 3 1 9
# 7: 1 a 3 1 1 1 9
# 8: 1 a 1 1 1 2 9
# 9: 1 a 2 0 2 1 9
# 10: 1 b 2 0 5 2 7
# 11: 1 b 3 0 5 2 7
# 12: 1 b 3 1 4 2 7
# 13: 1 b 1 1 1 1 7
# 14: 1 b 1 1 4 1 7
# 15: 1 b 1 0 1 1 7
# 16: 1 b 1 0 3 1 7
# 17: 2 a 2 0 5 1 6
# 18: 2 a 1 0 5 1 6
# 19: 2 a 3 1 1 2 6
# 20: 2 a 1 0 2 1 6
# 21: 2 a 3 1 1 2 6
# 22: 2 a 1 1 3 2 6
# 23: 2 b 3 0 2 1 10
# 24: 2 b 2 1 5 1 10
# 25: 2 b 3 0 1 1 10
# 26: 2 b 3 1 2 1 10
# 27: 2 b 2 0 5 1 10
# 28: 2 b 2 0 2 1 10
# 29: 2 b 2 0 2 2 10
# 30: 2 b 1 0 1 1 10
# 31: 2 b 3 0 5 1 10
# 32: 2 b 3 0 5 1 10
# Group1 Group2 V1 V2 V3 V4 Size
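Because sample() draws at random, the rows shown above differ between runs; setting a seed before the sampling step makes the result reproducible (a usage note, not part of the original answer):
set.seed(1)  # any fixed seed, run before sampling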
Input data used: as defined in the question above.
Here's a more parameterized version where you explicitly set the columns you want to fill and the keys connecting the two tables:
fill_key <- c('Group1', 'Group2')
columns_to_fill <- paste0('V', 1:4)
# sample indices from each group
i <- data2[data1, on = fill_key
  ][, .(i_samp = sample(.I, Size)), by = c(fill_key, 'Size')
  ][, i_samp]
# subset to sampled indices
merge(data1[i, c(fill_key, columns_to_fill), with = FALSE],
      data2[, c(fill_key, 'Size'), with = FALSE])
One dplyr option could be (sample_n() still works, though newer dplyr versions supersede it with slice_sample()):
data1 %>%
  left_join(data2 %>%
              select(-starts_with("V"))) %>%
  group_by(Group1, Group2) %>%
  sample_n(Size)
Group1 Group2 V1 V2 V3 V4 Size
<dbl> <fct> <int> <int> <int> <int> <dbl>
1 1 a 1 1 1 2 9
2 1 a 3 0 3 2 9
3 1 a 2 0 3 2 9
4 1 a 1 1 2 1 9
5 1 a 2 1 2 2 9
6 1 a 2 0 5 2 9
7 1 a 1 0 1 2 9
8 1 a 3 0 5 2 9
9 1 a 1 0 5 1 9
10 1 b 2 0 1 1 7
11 1 b 2 1 3 1 7
12 1 b 3 1 4 2 7
13 1 b 1 1 1 1 7
14 1 b 2 1 2 2 7
15 1 b 1 1 1 2 7
16 1 b 1 1 2 1 7
17 2 a 3 1 5 1 6
18 2 a 1 0 5 1 6
19 2 a 1 0 1 1 6
20 2 a 2 0 5 1 6
21 2 a 3 0 1 1 6
22 2 a 2 1 4 1 6
23 2 b 3 0 2 1 10
24 2 b 1 1 5 1 10
25 2 b 3 1 1 1 10
26 2 b 3 1 4 1 10
27 2 b 1 0 4 2 10
28 2 b 3 1 1 2 10
29 2 b 2 1 4 1 10
30 2 b 1 0 1 1 10
31 2 b 2 1 4 2 10
32 2 b 2 1 5 2 10

Count by group with condition (in R data.table)

Consider a dataset that consists of ID and Val; the Counter column below shows the desired result.
# dataset
ID Val Counter
1 2 1
1 4 2
1 NA 2
1 13 3
1 12 4
2 NA 0
2 33 1
2 5 2
2 5 3
A counter per subgroup can be added with dt[, normal_counter := 1:.N, by=ID]. I am looking for a counter that is not incremented when there is an NA value (see the Counter column in the example above).
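A runnable reconstruction of the example (a sketch, not part of the original question; Counter is the desired result, kept for comparison):
library(data.table)
dat <- data.table(
  ID = c(1, 1, 1, 1, 1, 2, 2, 2, 2),
  Val = c(2, 4, NA, 13, 12, NA, 33, 5, 5),
  Counter = c(1, 2, 2, 3, 4, 0, 1, 2, 3)
)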
This is a cumulative sum of non-NA values by group, so:
dat[, cntr := cumsum(!is.na(Val)), by=ID]
dat
# ID Val Counter cntr
#1: 1 2 1 1
#2: 1 4 2 2
#3: 1 NA 2 2
#4: 1 13 3 3
#5: 1 12 4 4
#6: 2 NA 0 0
#7: 2 33 1 1
#8: 2 5 2 2
#9: 2 5 3 3
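The same cumulative-sum idea carries over to dplyr (a sketch, not part of the original answer):
library(dplyr)
dat %>%
  group_by(ID) %>%
  mutate(cntr = cumsum(!is.na(Val))) %>%
  ungroup()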

Delete rows with value if not only value in group

Somewhat new to R, and I find myself needing to delete rows based on multiple criteria. The data frame has 3 columns, and I need to delete rows where bid=99 whenever the group defined by rid and qid also contains values less than 99. At the rid/qid level, the desired output therefore keeps either the bid values below 99, or the bid=99 row when 99 is the group's only value.
rid qid bid
1 1 5
1 1 6
1 1 99
1 2 6
2 1 7
2 1 99
2 2 2
2 2 3
3 1 7
3 1 8
3 2 1
3 2 99
4 1 2
4 1 6
4 2 1
4 2 2
4 2 99
5 1 99
5 2 99
The expected output...
rid qid bid
1 1 5
1 1 6
1 2 6
2 1 7
2 2 2
2 2 3
3 1 7
3 1 8
3 2 1
4 1 2
4 1 6
4 2 1
4 2 2
5 1 99
5 2 99
Any assistance would be appreciated.
You can use the base R function ave to generate a dropping variable like this:
df$dropper <- with(df, ave(bid, rid, qid, FUN = function(i) i == 99 & length(i) > 1))
ave applies the function to bid within groups defined by rid and qid. The function tests whether each element of the grouped bid values i equals 99 and whether i has length greater than 1; with is used to reduce typing.
which returns
df
rid qid bid dropper
1 1 1 5 0
2 1 1 6 0
3 1 1 99 1
4 1 2 6 0
5 2 1 7 0
6 2 1 99 1
7 2 2 2 0
8 2 2 3 0
9 3 1 7 0
10 3 1 8 0
11 3 2 1 0
12 3 2 99 1
13 4 1 2 0
14 4 1 6 0
15 4 2 1 0
16 4 2 2 0
17 4 2 99 1
18 5 1 99 0
19 5 2 99 0
Then drop the undesired observations with df[df$dropper == 0, 1:3], which simultaneously drops the helper variable.
If you just want to delete every row where bid = 99, you can use dplyr:
library(dplyr)
df <- df %>%
  filter(bid != 99)
where df is your data frame and != means 'not equal to'. (Note this also drops the rid=5 rows, where 99 is the only value, so it does not match the expected output above.)
Updated solution using dplyr:
df %>%
  group_by(rid, qid) %>%
  mutate(tempcount = n()) %>%
  ungroup() %>%
  mutate(DropValue = ifelse(bid == 99 & tempcount > 1, 1, 0)) %>%
  filter(DropValue == 0) %>%
  select(rid, qid, bid)
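The two helper columns can also be folded into a single grouped filter; an equivalent sketch (n() gives the group size inside filter()):
df %>%
  group_by(rid, qid) %>%
  filter(!(bid == 99 & n() > 1)) %>%
  ungroup()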
Here is another option using all and an if condition in data.table to subset the rows after grouping by 'rid' and 'qid':
library(data.table)
setDT(df1)[, if (all(bid == 99)) .SD else .SD[bid != 99], .(rid, qid)]
# rid qid bid
# 1: 1 1 5
# 2: 1 1 6
# 3: 1 2 6
# 4: 2 1 7
# 5: 2 2 2
# 6: 2 2 3
# 7: 3 1 7
# 8: 3 1 8
# 9: 3 2 1
#10: 4 1 2
#11: 4 1 6
#12: 4 2 1
#13: 4 2 2
#14: 5 1 99
#15: 5 2 99
Or without using the if
setDT(df1)[df1[, .I[all(bid==99) | bid != 99], .(rid, qid)]$V1]
Here is a solution using dplyr, which is a very expressive framework for this kind of problem.
df <- read.table(text =
" rid qid bid
1 1 5
1 1 6
1 1 99
1 2 6
2 1 7
2 1 99
2 2 2
2 2 3
3 1 7
3 1 8
3 2 1
3 2 99
4 1 2
4 1 6
4 2 1
4 2 2
4 2 99
5 1 99
5 2 99",
header = TRUE, stringsAsFactors = FALSE)
dplyr verbs let you express the program in terms that stay close to the wording of the question:
library(dplyr)
res <-
  df %>%
  group_by(rid, qid) %>%
  filter(!(any(bid < 99) & bid == 99)) %>%
  ungroup()
res
# # A tibble: 15 × 3
# rid qid bid
# <int> <int> <int>
# 1 1 1 5
# 2 1 1 6
# 3 1 2 6
# 4 2 1 7
# 5 2 2 2
# 6 2 2 3
# 7 3 1 7
# 8 3 1 8
# 9 3 2 1
# 10 4 1 2
# 11 4 1 6
# 12 4 2 1
# 13 4 2 2
# 14 5 1 99
# 15 5 2 99
Let's check we get the desired output:
desired_output <- read.table(text =
" rid qid bid
1 1 5
1 1 6
1 2 6
2 1 7
2 2 2
2 2 3
3 1 7
3 1 8
3 2 1
4 1 2
4 1 6
4 2 1
4 2 2
5 1 99
5 2 99",
header = TRUE, stringsAsFactors = FALSE)
identical(as.data.frame(res), desired_output)
# [1] TRUE

Indexing by multiple criteria between 2 data frames in R

I have the following data frame:
id day event
1 1 1
1 3 1
2 1 0
2 4 0
2 9 0
2 15 0
3 2 0
3 5 0
4 1 1
4 8 1
4 11 1
What I want is: whenever an id has an event value of zero, all of that id's event values should become one except the last one (by day). So the output should be the following:
id day event
1 1 1
1 3 1
2 1 1
2 4 1
2 9 1
2 15 0
3 2 1
3 5 0
4 1 1
4 8 1
4 11 1
Any help?
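A runnable reconstruction of the input (a sketch; the answers below call it df1):
df1 <- read.table(text = "
id day event
1 1 1
1 3 1
2 1 0
2 4 0
2 9 0
2 15 0
3 2 0
3 5 0
4 1 1
4 8 1
4 11 1
", header = TRUE)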
We could use data.table. Convert the 'data.frame' to a 'data.table' (setDT(df1)); then, grouped by 'id', if any 'event' is 0 (any(!event)) for that 'id', we replicate 1 for the group length minus one (.N-1) and concatenate a 0, otherwise we return 'event' unchanged, assigning (:=) to update the 'event' column.
library(data.table)
setDT(df1)[, event := if (any(!event)) c(rep(1L, .N - 1), 0L) else event, by = id]
df1
# id day event
# 1: 1 1 1
# 2: 1 3 1
# 3: 2 1 1
# 4: 2 4 1
# 5: 2 9 1
# 6: 2 15 0
# 7: 3 2 1
# 8: 3 5 0
# 9: 4 1 1
#10: 4 8 1
#11: 4 11 1
Or using dplyr: we group by 'id' and rebuild 'event' as the lead of rep(any(!event), n()) (1 for every row but the last in a group that contains a zero, with default = 0 filling the last row) plus all(event) (1 for every row of a group with no zeros).
library(dplyr)
df1 %>%
  group_by(id) %>%
  mutate(event = lead(rep(any(!event), n()), default = 0) + all(event))
# id day event
# (int) (int) (dbl)
#1 1 1 1
#2 1 3 1
#3 2 1 1
#4 2 4 1
#5 2 9 1
#6 2 15 0
#7 3 2 1
#8 3 5 0
#9 4 1 1
#10 4 8 1
#11 4 11 1
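Not part of the original answers, but the data.table if/else logic above also translates directly into a grouped mutate; a sketch:
library(dplyr)
df1 %>%
  group_by(id) %>%
  mutate(event = if (any(event == 0)) c(rep(1L, n() - 1), 0L) else event) %>%
  ungroup()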

Extracting event rows from a data frame

I have this data frame:
df <-
ID var TIME value method
1 3 0 2 1
1 3 2 2 1
1 3 3 0 1
1 4 0 10 1
1 4 2 10 1
1 4 4 5 1
1 4 6 5 1
2 3 0 2 1
2 3 2 2 1
2 3 3 0 1
2 4 0 10 1
2 4 2 10 1
2 4 4 5 1
2 4 6 5 1
I want to extract the rows at which a new event occurs in the value column. For example, for ID=1, var=3 has a value of 2 at TIME=0. The value stays the same at TIME=2, so I would keep only the first row (TIME=0) and discard the second. In the third row, however, the value for var=3 changes to zero, so I have to extract that row as well, and so on for the rest of the variables. This has to be applied for every subject ID. For the above df, the result should be as follows:
dfevent <-
ID var TIME value method
1 3 0 2 1
1 3 3 0 1
1 4 0 10 1
1 4 4 5 1
2 3 0 2 1
2 3 3 0 1
2 4 0 10 1
2 4 4 5 1
Could anyone help me do this in R? I have a huge data set and want to extract the rows at which a new event occurs in the value of every var. There are several such variables in the data frame (numbered 3, 4, 5, 6, and 7); the above is an example for two of them (variable numbers 3 and 4).
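A runnable reconstruction of df (a sketch, not part of the original question):
df <- read.table(text = "
ID var TIME value method
1 3 0 2 1
1 3 2 2 1
1 3 3 0 1
1 4 0 10 1
1 4 2 10 1
1 4 4 5 1
1 4 6 5 1
2 3 0 2 1
2 3 2 2 1
2 3 3 0 1
2 4 0 10 1
2 4 2 10 1
2 4 4 5 1
2 4 6 5 1
", header = TRUE)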
This does it using dplyr:
library(dplyr)
df %>%
  group_by(ID, var) %>%
  mutate(tf = ifelse(value == lag(value), 1, 0)) %>%
  filter(is.na(tf) | tf == 0) %>%
  select(-tf)
# ID var TIME value method
#1 1 3 0 2 1
#2 1 3 3 0 1
#3 1 4 0 10 1
#4 1 4 4 5 1
#5 2 3 0 2 1
#6 2 3 3 0 1
#7 2 4 0 10 1
#8 2 4 4 5 1
Basically, I created an extra variable that is 1 when the value is the same as in the preceding row within groups of unique ID/var combinations (lag() yields NA in each group's first row, which the filter keeps). We then get rid of this helper variable before returning the output.
Base solution:
df[with(df, abs(ave(value, ID, FUN = function(x) c(1, diff(x))))) > 0, ]
# ID var TIME value method
#1 1 3 0 2 1
#3 1 3 3 0 1
#4 1 4 0 10 1
#6 1 4 4 5 1
#8 2 3 0 2 1
#10 2 3 3 0 1
#11 2 4 0 10 1
#13 2 4 4 5 1
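Note that this ave() call groups by ID only; it works here because value also changes at every ID/var boundary in this data. A variant grouping by both keys is safer (a sketch):
df[with(df, abs(ave(value, ID, var, FUN = function(x) c(1, diff(x))))) > 0, ]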
Going by the expected results, you may also try rleid from data.table:
library(data.table)#data.table_1.9.5
setDT(df)[df[, .I[1L] , list(ID, var, rleid(value))]$V1]
# ID var TIME value method
#1: 1 3 0 2 1
#2: 1 3 3 0 1
#3: 1 4 0 10 1
#4: 1 4 4 5 1
#5: 2 3 0 2 1
#6: 2 3 3 0 1
#7: 2 4 0 10 1
#8: 2 4 4 5 1
Or a similar approach to @thelatemail's:
setDT(df)[df[, .I[abs(c(1, diff(value))) > 0], ID]$V1]
Or
unique(setDT(df)[, id:=rleid(value)], by=c('ID', 'var', 'id'))
