new data set with combining some rows - r

This is not a question about reshaping between long and wide formats, so please do not mark it as a duplicate.
Suppose I have :
HouseholdID. PersonID. time. dur. age
1 1 3 4 19
1 2 3 4 29
1 3 5 5 30
1 1 5 5 18
2 1 21 30 18
2 2 21 30 30
In each household, some people have the same time and dur. I want to combine only the rows that have the same HouseholdID, time and dur.
OUTPUT:
HouseholdID. PersonID. time. dur. age. HouseholdID. PersonID. time. dur. age
1 1 3 4 19 1 2 3 4 29
1 3 5 5 30 1 1 5 5 18
2 1 21 30 18 2 2 21 30 30

An option would be dcast from data.table which can take multiple value.var columns
library(data.table)
# Cast to wide: one row per household, with every value.var column fanned out
# by the within-household row index from rowid().
# setDT() converts df1 to a data.table by reference, so df1 itself is modified.
dcast(setDT(df1), HouseholdID. ~ rowid(HouseholdID.),
value.var = c("PersonID.", "time.", "dur.", "age"), sep="")
# HouseholdID. PersonID.1 PersonID.2 time.1 time.2 dur.1 dur.2 age1 age2
#1: 1 1 2 3 3 4 4 19 29
#2: 2 1 2 21 21 30 30 18 30
Or an option with pivot_wider from the devel version of tidyr
library(tidyr) # ‘0.8.3.9000’
library(dplyr)
# Number the rows within each household, then spread every value column into
# one column per row number (PersonID.1, PersonID.2, ...).
df1 %>%
  group_by(HouseholdID.) %>%
  mutate(rn = row_number()) %>%
  # The separator argument is `names_sep`; `name_sep` is not a pivot_wider()
  # parameter.
  pivot_wider(id_cols = HouseholdID., names_from = rn,
              values_from = c(PersonID., time., dur., age), names_sep = "")
# A tibble: 2 x 9
# HouseholdID. PersonID.1 PersonID.2 time.1 time.2 dur.1 dur.2 age1 age2
# <int> <int> <int> <int> <int> <int> <int> <int> <int>
#1 1 1 2 3 3 4 4 19 29
#2 2 1 2 21 21 30 30 18 30
Update
With the new dataset, extend the id columns by including the 'time.' and 'dur.'
dcast(setDT(df2), HouseholdID. + time. + dur. ~ rowid(HouseholdID., time., dur.),
value.var = c("PersonID.", "age"), sep="")
If we need duplicate columns for 'time.' and 'dur.' (not clear why it is needed though)
# 'time.' and 'dur.' are used both as id columns (left of ~) and as value.var,
# so they are also fanned out once per person. The id copies are then removed
# by reference with := NULL, and the trailing [] makes the result print.
dcast(setDT(df2), HouseholdID. + time. + dur. ~ rowid(HouseholdID., time., dur.),
value.var = c("PersonID.", "time.", "dur.", "age"), sep="")[,
c('time.', 'dur.') := NULL][]
# HouseholdID. PersonID.1 PersonID.2 time..11 time..12 dur..11 dur..12 age1 age2
#1: 1 1 2 3 3 4 4 19 29
#2: 1 3 1 5 5 5 5 30 18
#3: 2 1 2 21 21 30 30 18 30
Or with tidyverse
# Number the people within each (household, time, dur) group, then spread
# PersonID. and age into one column per person number.
df2 %>%
group_by(HouseholdID., time., dur.) %>%
mutate(rn = row_number()) %>%
pivot_wider(id_cols= c(HouseholdID., time., dur.), names_from = rn,
values_from = c(PersonID., age), names_sep = "")
# A tibble: 3 x 7
# HouseholdID. time. dur. PersonID.1 PersonID.2 age1 age2
# <int> <int> <int> <int> <int> <int> <int>
#1 1 3 4 1 2 19 29
#2 1 5 5 3 1 30 18
#3 2 21 30 1 2 18 30
NOTE: duplicate column names are not recommended as it can lead to confusion in identification of columns.
data
df1 <- structure(list(HouseholdID. = c(1L, 1L, 2L, 2L), PersonID. = c(1L,
2L, 1L, 2L), time. = c(3L, 3L, 21L, 21L), dur. = c(4L, 4L, 30L,
30L), age = c(19L, 29L, 18L, 30L)), class = "data.frame", row.names = c(NA,
-4L))
df2 <- structure(list(HouseholdID. = c(1L, 1L, 1L, 1L, 2L, 2L), PersonID. = c(1L,
2L, 3L, 1L, 1L, 2L), time. = c(3L, 3L, 5L, 5L, 21L, 21L), dur. = c(4L,
4L, 5L, 5L, 30L, 30L), age = c(19L, 29L, 30L, 18L, 18L, 30L)),
class = "data.frame", row.names = c(NA,
-6L))

Related

Find the "top N" in a group and find the average of the "top N" in R

Rank Laps Average Time
1 1 1 30
2 2 1 34
3 3 1 35
4 1 2 32
5 2 2 33
6 3 2 56
7 4 1 43
8 5 1 23
9 6 1 31
10 4 2 23
11 5 2 88
12 6 2 54
I would like to know how I can group ranks 1-3 and ranks 4-6 and get an average of the "average time" for each lap. Also, I would like this to extend if I have groups 7-9, 10-13, etc.
One option is to use cut to put the different ranks into groups, and add Laps as a grouping variable. Then, you can summarize the data to get the mean.
library(tidyverse)
# Bin Rank into right-closed intervals (0,3] and (3,6] with cut(), then
# average Average_Time within each (rank bin, Laps) combination.
df %>%
group_by(gr = cut(Rank, breaks = seq(0, 6, by = 3)), Laps) %>%
summarize(avg = mean(Average_Time))
Output
gr Laps avg
<fct> <int> <dbl>
1 (0,3] 1 33
2 (0,3] 2 40.3
3 (3,6] 1 32.3
4 (3,6] 2 55
Or another option if you want the range of ranks displayed for the group:
# First group by the cut() bin only, so min/max of Rank can be pasted into a
# readable "lo-hi" label; then regroup by that label and Laps for the mean.
df %>%
group_by(gr = cut(Rank, breaks = seq(0, 6, by = 3))) %>%
mutate(Rank_gr = paste0(min(Rank), "-", max(Rank))) %>%
group_by(Rank_gr, Laps) %>%
summarize(avg = mean(Average_Time))
Output
Rank_gr Laps avg
<chr> <int> <dbl>
1 1-3 1 33
2 1-3 2 40.3
3 4-6 1 32.3
4 4-6 2 55
Since you will have uneven groups, then you might want to use case_when to make the groups:
# Spell out each rank range explicitly so the groups may have unequal widths.
# A Rank that matches none of the conditions falls into an NA group.
df %>%
group_by(gr=case_when(Rank %in% 1:3 ~ "1-3",
Rank %in% 4:6 ~ "4-6",
Rank %in% 7:9 ~ "7-9",
Rank %in% 10:13 ~ "10-13"),
Laps) %>%
summarize(avg = mean(Average_Time))
Data
df <- structure(list(Rank = c(1L, 2L, 3L, 1L, 2L, 3L, 4L, 5L, 6L, 4L,
5L, 6L), Laps = c(1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L,
2L), Average_Time = c(30L, 34L, 35L, 32L, 33L, 56L, 43L, 23L,
31L, 23L, 88L, 54L)), class = "data.frame", row.names = c(NA,
-12L))

How to get the difference between groups with a dataframe in long format in R?

Have a simple dataframe with 2 ID's (N = 2) and 2 periods (T = 2), for example:
year id points
1 1 10
1 2 12
2 1 20
2 2 18
How does one achieve the following dataframe (preferably using dplyr or any tidyverse solution)?
id points_difference
1 10
2 6
Notice that the points_difference column is the difference for each ID across time (namely T2 - T1).
Additionally, how to generalize for multiple columns and multiple ID (with only 2 periods)?
year id points scores
1 1 10 7
1 ... ... ...
1 N 12 8
2 1 20 9
2 ... ... ...
2 N 12 9
id points_difference scores_difference
1 10 2
... ... ...
N 0 1
If you are on dplyr 1.0.0(or higher), summarise can return multiple rows in output so this will also work if you have more than 2 periods. You can do :
library(dplyr)
# Sort by id and year so diff() subtracts consecutive periods in order, then
# take the lagged differences of points and scores within each id.
# With k periods, diff() yields k - 1 rows per id.
# NOTE(review): newer dplyr releases (1.1.0+) deprecate multi-row results from
# summarise() in favour of reframe() -- confirm against the installed version.
df %>%
arrange(id, year) %>%
group_by(id) %>%
summarise(across(c(points, scores), diff, .names = '{col}_difference'))
# id points_difference scores_difference
# <int> <int> <int>
#1 1 10 2
#2 1 -7 1
#3 2 6 2
#4 2 -3 3
data
df <- structure(list(year = c(1L, 1L, 2L, 2L, 3L, 3L), id = c(1L, 2L,
1L, 2L, 1L, 2L), points = c(10L, 12L, 20L, 18L, 13L, 15L), scores = c(2L,
3L, 4L, 5L, 5L, 8L)), class = "data.frame", row.names = c(NA, -6L))

Merging two datasets by an ID without adding new columns that say ".x" or ".y"

Suppose I have two datasets. One main dataset, with many columns of metadata, and one new dataset which will be used to fill in some of the gaps in concentrations in the main dataset:
Main dataset:
study_id timepoint age occupation concentration1 concentration2
1 1 21 0 3 7
1 2 21 0 4 6
1 3 22 0 NA NA
1 4 22 0 NA NA
2 1 36 3 0 4
2 2 36 3 2 11
2 3 37 3 NA NA
2 4 37 3 NA NA
New data set to merge:
study_id timepoint concentration1 concentration2
1 3 11 20
1 4 21 35
2 3 7 17
2 4 14 25
Whenever I merge by "study_id" and "timepoint", I get two new columns that are "concentration1.y" and "concentration2.y" while the original columns get renamed as "concentration1.x" and "concentration2.x". I don't want this.
This is what I want:
study_id timepoint age occupation concentration1 concentration2
1 1 21 0 3 7
1 2 21 0 4 6
1 3 22 0 11 20
1 4 22 0 21 35
2 1 36 3 0 4
2 2 36 3 2 11
2 3 37 3 7 17
2 4 37 3 14 25
In other words, I want to merge by "study_id" and "timepoint" AND merge the two concentration columns so the data are within the same columns. Please note that both datasets do not have identical columns (dataset 1 has 1000 columns with metadata while dataset2 just has study id, timepoint, and concentration columns that match the concentration columns in dataset1).
Thanks so much in advance.
Using coalesce is one option (from dplyr package). This still adds the two columns for concentration 1 and 2 from the second data frame. These would be removed after NA filled in.
library(tidyverse)
# Left-join the new concentrations onto the main data; the clashing columns
# come back suffixed .x (from df1) and .y (from df2).
# coalesce() keeps the df1 value and falls back to df2 where df1 is NA; the
# suffixed helper columns are then dropped.
df1 %>%
left_join(df2, by = c("study_id", "timepoint")) %>%
mutate(concentration1 = coalesce(concentration1.x, concentration1.y),
concentration2 = coalesce(concentration2.x, concentration2.y)) %>%
select(-concentration1.x, -concentration1.y, -concentration2.x, -concentration2.y)
Or to generalize with multiple concentration columns:
# Join, then collapse every <name>.x / <name>.y pair into a single column.
df1 %>%
  left_join(df2, by = c("study_id", "timepoint")) %>%
  # Group the columns by base name. The suffix pattern is anchored to the end
  # of the name so that a ".x" or ".y" occurring inside a metadata column name
  # is left untouched.
  split.default(str_remove(names(.), "\\.[xy]$")) %>%
  # Within each group, take the first non-NA value across its columns.
  map_df(reduce, coalesce)
Edit: To prevent the resultant column names from being alphabetized from split.default, you can add an intermediate step of sorting the list based on the first data frame's column name order.
# Join and group the columns by base name. The suffix pattern is anchored to
# the end of the name so that a ".x" or ".y" occurring inside a metadata
# column name is left untouched.
df3 <- df1 %>%
  left_join(df2, by = c("study_id", "timepoint")) %>%
  split.default(str_remove(names(.), "\\.[xy]$"))
# split.default() returns the groups in alphabetical order; index by
# names(df1) to restore the original column order before coalescing.
df3[names(df1)] %>%
  map_df(reduce, coalesce)
Output
study_id timepoint age occupation concentration1 concentration2
1 1 1 21 0 3 7
2 1 2 21 0 4 6
3 1 3 22 0 11 20
4 1 4 22 0 21 35
5 2 1 36 3 0 4
6 2 2 36 3 2 11
7 2 3 37 3 7 17
8 2 4 37 3 14 25
Data
df1 <- structure(list(study_id = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L),
timepoint = c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L), age = c(21L,
21L, 22L, 22L, 36L, 36L, 37L, 37L), occupation = c(0L, 0L,
0L, 0L, 3L, 3L, 3L, 3L), concentration1 = c(3L, 4L, NA, NA,
0L, 2L, NA, NA), concentration2 = c(7L, 6L, NA, NA, 4L, 11L,
NA, NA)), class = "data.frame", row.names = c(NA, -8L))
df2 <- structure(list(study_id = c(1L, 1L, 2L, 2L), timepoint = c(3L,
4L, 3L, 4L), concentration1 = c(11L, 21L, 7L, 14L), concentration2 = c(20L,
35L, 17L, 25L)), class = "data.frame", row.names = c(NA, -4L))

Rounding error when grouping by multiple categories

Why are the values for SE_daily wrong? I expected it to round to the nearest integer (though I wanted a decimal), instead the decimal answer is completely wrong. What did I miss?
csv<-csv%>%group_by(id_num)%>%group_by(Month)%>%group_by(Day)%>%mutate(SE_daily=mean(SelfEsteem, na.rm=T))
head(csv[,c(1:5,28,181)])
> head(csv[,c(1:5,28,181)])
Source: local data frame [6 x 7]
Groups: Day [3]
X.1 X id_num Month Day SelfEsteem SE_daily
<int> <int> <int> <int> <int> <int> <dbl>
1 1 1 29 2 19 4 3.457944 #mean(4,4,3)= 4, expected answer= 3.66666666667
2 2 2 29 2 19 4 3.457944
3 3 3 29 2 19 3 3.457944
4 4 4 29 2 20 4 3.424242 #expected answer= 4
5 5 5 29 2 21 4 3.318182 #expected answer=4
6 6 6 29 2 21 4 3.318182
head of csv output:
structure(list(X.1 = 1:6, X = 1:6,
id_num = c(29L, 29L, 29L, 29L, 29L, 29L),
Month = c(2L, 2L, 2L, 2L, 2L, 2L),
Day = c(19L, 19L, 19L, 20L, 21L, 21L),
SelfEsteem = c(4L, 4L, 3L, 4L, 4L, 4L),
SE_daily = c(3.45794392523365, 3.45794392523365, 3.45794392523365, 3.42424242424242, 3.31818181818182, 3.31818181818182)),
.Names = c("X.1", "X", "id_num", "Month", "Day", "SelfEsteem", "SE_daily"),
row.names = c(NA, -6L),
class = "data.frame")
I got the expected output for SE_daily. It's possible that by piping the group_by commands instead of putting them in a single command you are looking at multiple id_num and Months that share a common Day (assuming that the provided data structure is only a subset of the entire data set)
library(dplyr)
# Pass all three keys to a single group_by(): chained group_by() calls replace
# the previous grouping, so the piped version ended up grouped by Day only.
csv %>%
group_by(id_num, Month, Day) %>%
mutate(SE_daily=mean(SelfEsteem, na.rm=TRUE))
output
Source: local data frame [6 x 7]
Groups: id_num, Month, Day [3]
X.1 X id_num Month Day SelfEsteem SE_daily
<int> <int> <int> <int> <int> <int> <dbl>
1 1 1 29 2 19 4 3.666667
2 2 2 29 2 19 4 3.666667
3 3 3 29 2 19 3 3.666667
4 4 4 29 2 20 4 4.000000
5 5 5 29 2 21 4 4.000000
6 6 6 29 2 21 4 4.000000

Subset of data with criteria of two columns

I would like to create a subset of data that consists of Units that have a higher score in QTR 4 than QTR 1 (upward trend). Doesn't matter if QTR 2 or 3 are present.
Unit QTR Score
5 4 34
1 1 22
5 3 67
2 4 78
3 2 39
5 2 34
1 2 34
5 1 67
1 3 70
1 4 89
3 4 19
Subset would be:
Unit QTR Score
1 1 22
1 2 34
1 3 70
1 4 89
I've tried variants of something like this:
upward_subset <- subset(mydata,Unit if QTR=4~Score > QTR=1~Score)
Thank you for your time
If the dataframe is named "d", then this succeeds on your test set:
# For each Unit, decide whether its QTR 4 score exceeds its QTR 1 score.
# Units missing either quarter are treated as not trending upward.
is_upward <- vapply(
  split(d, d$Unit),
  function(dd) {
    q4 <- dd$Score[dd$QTR == 4]
    q1 <- dd$Score[dd$QTR == 1]
    # Both quarters must be present exactly once to be comparable; isTRUE()
    # also turns an NA comparison into FALSE.
    length(q4) == 1L && length(q1) == 1L && isTRUE(q4 > q1)
  },
  logical(1)
)
# Match on the Unit labels (the names of the split), not on the logical
# values themselves, then keep every row of the qualifying Units.
d[d$Unit %in% names(is_upward)[is_upward], ]
#-------------
Unit QTR Score
2 1 1 22
7 1 2 34
9 1 3 70
10 1 4 89
An alternative in two steps:
# Step 1: per Unit, flag whether the QTR 4 score beats the QTR 1 score
# (the comparison is against quarter 1, as the question asks).
# Units lacking either quarter yield logical(0) and are dropped by unlist().
result <- unlist(
  by(
    test,
    test$Unit,
    function(x) x$Score[x$QTR == 4] > x$Score[x$QTR == 1]
  )
)
# Step 2: keep all rows of the Units flagged TRUE; which() ignores NA flags.
test[test$Unit %in% names(result)[which(result)], ]
Unit QTR Score
2 1 1 22
7 1 2 34
9 1 3 70
10 1 4 89
A solution using data.table (Probably there are better versions than what I have at the moment).
Note: Assuming a QTR value for a given Unit is unique
Data:
df <- structure(list(Unit = c(5L, 1L, 5L, 2L, 3L, 5L, 1L, 5L, 1L, 1L,
3L), QTR = c(4L, 1L, 3L, 4L, 2L, 2L, 2L, 1L, 3L, 4L, 4L), Score = c(34L,
22L, 67L, 78L, 39L, 34L, 34L, 67L, 70L, 89L, 19L)), .Names = c("Unit",
"QTR", "Score"), class = "data.frame", row.names = c(NA, -11L
))
Solution:
# Build a data.table keyed (and therefore sorted) by Unit, then QTR.
dt <- data.table(df, key=c("Unit", "QTR"))
# Per Unit, the inner comparison is a single TRUE/FALSE (QTR 4 score > QTR 1
# score). Used as a logical index it is recycled, so TRUE returns every Score
# of the Unit and FALSE returns none. Units missing a quarter give logical(0)
# and also return nothing. Only Score is returned (as V1), not QTR.
dt[, Score[Score[QTR == 4] > Score[QTR == 1]], by=Unit]
Unit V1
1: 1 22
2: 1 34
3: 1 70
4: 1 89

Resources