I want to reshape the original dataframe into the target dataframe as follows.
But first, to recreate dataframes:
original <- data.frame(caseid = c("id101", 'id201', 'id202', 'id301', 'id302'),
age_child1 = c('3', '5', '8', NA, NA),
age_child2 = c('1', '7', NA, NA, NA),
age_child3 = c('2', '6', '8', '3', NA))
target <- data.frame(caseid = c('id101_1', 'id101_2', 'id101_3', 'id201_1', 'id201_2', 'id201_3', 'id202_1', 'id202_3', 'id301_3'),
age = c(3, 1, 2, 5, 7, 6, 8, 8, 3))
The caseid column represents mothers. I want to create a new caseid row for each of the children and add the respective 'age' value to the age column. If no 'age' value is available, it means there is no nth child and no new row should be created.
Thanks for the help!
You can use pivot_longer() and its various helpful options:
library(dplyr)
library(tidyr)

pivot_longer(original, cols = starts_with("age"), names_prefix = "age_child",
             values_to = "age", values_transform = as.integer) %>%
  filter(!is.na(age)) %>%
  mutate(caseid = paste0(caseid, "_", name)) %>%
  select(-name)
Output:
# A tibble: 9 × 2
caseid age
<chr> <int>
1 id101_1 3
2 id101_2 1
3 id101_3 2
4 id201_1 5
5 id201_2 7
6 id201_3 6
7 id202_1 8
8 id202_3 8
9 id301_3 3
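Note: passing a bare function to values_transform, as above, requires a reasonably recent tidyr; older versions expect a named list, e.g. values_transform = list(age = as.integer).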
Using reshape from base R:
original <- data.frame(caseid = c("id101", 'id201', 'id202', 'id301', 'id302'),
age_child1 = c('3', '5', '8', NA, NA),
age_child2 = c('1', '7', NA, NA, NA),
age_child3 = c('2', '6', '8', '3', NA))
a <- reshape(original, varying = c("age_child1", "age_child2", "age_child3"),
             direction = "long",
             times = c("_1", "_2", "_3"),
             v.names = "age")
# append the child suffix to the mother's id
a$caseid <- paste0(a$caseid, a$time)
# order by the new id and keep only the two target columns
a <- a[order(a$caseid), ][c("caseid", "age")]
# drop rows for children that do not exist
a <- na.omit(a)
row.names(a) <- NULL
a
#> caseid age
#> 1 id101_1 3
#> 2 id101_2 1
#> 3 id101_3 2
#> 4 id201_1 5
#> 5 id201_2 7
#> 6 id201_3 6
#> 7 id202_1 8
#> 8 id202_3 8
#> 9 id301_3 3
Created on 2022-06-01 by the reprex package (v2.0.1)
library(tidyr)

original %>%
  pivot_longer(-caseid, names_to = 'child', names_pattern = '([0-9]+$)',
               values_to = 'age', values_drop_na = TRUE) %>%
  unite(caseid, caseid, child)
# A tibble: 9 x 2
caseid age
<chr> <chr>
1 id101_1 3
2 id101_2 1
3 id101_3 2
4 id201_1 5
5 id201_2 7
6 id201_3 6
7 id202_1 8
8 id202_3 8
9 id301_3 3
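Note that age stays character (<chr>) here, since no transform is applied during the pivot. A one-line hedged fix, assuming the result is stored in res: res$age <- as.integer(res$age). Alternatively, use values_transform as in the first answer.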
I've been trying to figure this out for two days:
I have a dataframe with more than 2 million observations, with this structure:
id = c(1,2,3,4,5,6,7,8,9,10,11,12)
group = c(1,1,1,1,2,2,2,2,3,3,3,3)
sex = c('M','F', 'M', 'M', 'M', 'F', 'F', 'F', 'M', 'M', 'M', 'F')
time = c(10, 11, 11.5, 13, 13.2, 15, 9, 7.4, 18, 12, 34.5, 21)
For each female, I would like to find the male with the closest time, within the same group.
For example, id = 2 is a female in group 1 with time = 11; the closest male in group 1 is id = 3.
And so on for each female in each group.
I tried to use something like this
keep <- function(x){
a <- df[which.min(abs(df[which(df[,'sex'] == "M"),'time']-x[,'time'])),]
return(a)
}
apply(df, 1, keep)
But it does not work.
If someone can help me it would be great.
Are you after something like below?
library(data.table)

setDT(df)[
  ,
  c(
    # the female rows of each group
    .SD[sex == "F"],
    # for each female time, pick the male id whose time has the smallest
    # absolute difference (outer() builds the full female x male matrix)
    .(closestM_id = id[sex == "M"][max.col(-abs(outer(
      time[sex == "F"],
      time[sex == "M"], "-"
    )))])
  ), group
]
which gives
group id sex time closestM_id
1: 1 2 F 11.0 3
2: 2 6 F 15.0 5
3: 2 7 F 9.0 5
4: 2 8 F 7.4 5
5: 3 12 F 21.0 9
Data
> dput(df)
structure(list(id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12),
group = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3), sex = c("M",
"F", "M", "M", "M", "F", "F", "F", "M", "M", "M", "F"), time = c(10,
11, 11.5, 13, 13.2, 15, 9, 7.4, 18, 12, 34.5, 21)), class = "data.frame", row.names = c(NA,
-12L))
A data.table solution using a rolling join to the nearest time.
Using the df from Thomas' answer:
setDT(df)
df[sex=="F",][,closestM_id := df[sex=="M",][df[sex=="F",],
x.id,
on = .(group, time), roll = "nearest"]]
# id group sex time closestM_id
# 1: 2 1 F 11.0 3
# 2: 6 2 F 15.0 5
# 3: 7 2 F 9.0 5
# 4: 8 2 F 7.4 5
# 5: 12 3 F 21.0 9
You could split the data.frame() by group and then by sex, and use outer() to find the absolute difference in time for every combination.
Code:
lapply(split(df, df[, "group"]), function(x){
# split by sex
tmp1 <- split(x, x[, "sex"])
# time difference for every combination
tmp2 <- abs(t(outer(tmp1[["M"]][, "time"], tmp1[["F"]][, "time"], "-")))
# find minimum for each woman (rowwise minimum)
# and connect those numbers with original ID in input data.frame
tmp3 <- tmp1[["M"]][apply(tmp2, 1, which.min), ]
# rownames to represent female ID
rownames(tmp3) <- tmp1[["F"]][, "id"]
# return
tmp3
})
# $`1`
# id group sex time
# 2 3 1 M 11.5
#
# $`2`
# id group sex time
# 6 5 2 M 13.2
# 7 5 2 M 13.2
# 8 5 2 M 13.2
#
# $`3`
# id group sex time
# 12 9 3 M 18
Now each group has its own data.frame(). The rownames() give the female's ID, and each row holds the man with the smallest absolute difference in time for that woman.
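If one combined data.frame is preferred, the grouped list can be bound back together and the female IDs recovered from the rownames; a minimal sketch, assuming the lapply() result above is stored in res (the f_id column name is my own):
res_df <- do.call(rbind, res)
# rbind() prefixes rownames with the group, e.g. "2.6"; keep the part after the dot
res_df$f_id <- as.numeric(sub(".*\\.", "", rownames(res_df)))
rownames(res_df) <- NULL
res_df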
Data
df <- data.frame(id = c(1,2,3,4,5,6,7,8,9,10,11,12),
group = c(1,1,1,1,2,2,2,2,3,3,3,3),
sex = c('M','F', 'M', 'M', 'M', 'F', 'F', 'F', 'M', 'M', 'M', 'F'),
time = c(10, 11, 11.5, 13, 13.2, 15, 9, 7.4, 18, 12, 34.5, 21))
Restructuring the data would help. Create a separate data frame for each sex, create a third data set with all unique pairings of males and females, then merge and subset to narrow it down to the desired pairs. expand.grid() is very handy for computing these sorts of combinations; after that, dplyr functions can handle the rest of the logic.
library(dplyr)
# create one data set for females
females <- df %>%
filter(sex == "F") %>%
select(f_id = id, f_time = time, f_group = group)
# create one data set for males
males <- df %>%
filter(sex == "M") %>%
select(m_id = id, m_time = time, m_group = group)
# All possible pairings of males and females
pairs <- expand.grid(f_id = females %>% pull(f_id),
m_id = males %>% pull(m_id),
stringsAsFactors = FALSE)
# Merge in information about each individual
pairs <- pairs %>%
left_join(females, by = "f_id") %>%
left_join(males, by = "m_id") %>%
# eliminate any pairings that are in different groups
filter(f_group == m_group)
pairs
Result, potential pairs
f_id m_id f_time f_group m_time m_group
1 2 1 11.0 1 10.0 1
2 2 3 11.0 1 11.5 1
3 2 4 11.0 1 13.0 1
4 6 5 15.0 2 13.2 2
5 7 5 9.0 2 13.2 2
6 8 5 7.4 2 13.2 2
7 12 9 21.0 3 18.0 3
8 12 10 21.0 3 12.0 3
9 12 11 21.0 3 34.5 3
# compute distances and
# subset for the closest male to each female
pairs %>%
mutate(diff = abs(m_time - f_time)) %>%
group_by(f_id) %>%
filter(diff == min(diff)) %>%
select(m_id, f_id)
Output, the closest pairs
# A tibble: 5 x 2
# Groups: f_id [5]
m_id f_id
<dbl> <dbl>
1 3 2
2 5 6
3 5 7
4 5 8
5 9 12
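Note that filter(diff == min(diff)) keeps ties, so a female equidistant from two males would appear twice. A hedged alternative if exactly one male per female is wanted:
# keep exactly one closest male per female, breaking ties arbitrarily
pairs %>%
  mutate(diff = abs(m_time - f_time)) %>%
  group_by(f_id) %>%
  slice_min(diff, n = 1, with_ties = FALSE) %>%
  select(m_id, f_id)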
Follow-up question to Dynamically create value labels with haven::labelled, where akrun provided a good answer using deframe.
I am using haven::labelled to set value labels of a variable. The goal is to create a fully documented dataset I can export to SPSS.
Now, say I have a df value_labels of values and their value labels. I also have a df df_data with variables to which I want allocate value labels.
value_labels <- tibble(
value = c(1:6, 1:3, NA),
labels = c(paste0("value", 1:6), paste0("value", 1:3), NA),
name = c(rep("var1", 6), rep("var2", 3), "var3")
)
df_data <- tibble(
id = 1:10,
var1 = floor(runif(10, 1, 7)),
var2 = floor(runif(10, 1, 4)),
var3 = rep("string", 10)
)
Manually, I would create value labels for df_data$var1 and df_data$var2 like so:
df_data$var1 <- haven::labelled(df_data$var1, labels = c(value1 = 1, value2 = 2, value3 = 3, value4 = 4, value5 = 5, value6 = 6))
df_data$var2 <- haven::labelled(df_data$var2, labels = c(value1 = 1, value2 = 2, value3 = 3))
I need a more dynamic way of assigning the correct value labels to the correct variables in a large dataset. The solution also needs to ignore character vectors, since I don't want these to have value labels. For that reason, var3 in value_labels is listed as NA.
The solution does not need to work with multiple datasets in a list.
Here is one option: we split the named 'value/labels' vector by 'name' after removing the NA rows, use the names of the resulting list to subset the columns of 'df_data', apply labelled(), and assign the result back to the same columns.
lbls2 <- na.omit(value_labels)
lstLbls <- with(lbls2, split(setNames(value, labels), name))
df_data[names(lstLbls)] <- Map(haven::labelled,
df_data[names(lstLbls)], labels = lstLbls)
df_data
# A tibble: 10 x 4
# id var1 var2 var3
# <int> <dbl+lbl> <dbl+lbl> <chr>
# 1 1 2 [value2] 2 [value2] string
# 2 2 5 [value5] 2 [value2] string
# 3 3 4 [value4] 1 [value1] string
# 4 4 1 [value1] 2 [value2] string
# 5 5 1 [value1] 1 [value1] string
# 6 6 6 [value6] 2 [value2] string
# 7 7 1 [value1] 3 [value3] string
# 8 8 1 [value1] 1 [value1] string
# 9 9 3 [value3] 3 [value3] string
#10 10 6 [value6] 1 [value1] string
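Since the stated goal is a fully documented SPSS export, the labelled data can then be written out with haven; a minimal sketch (the file name is my own choice):
# write the labelled data frame to an SPSS .sav file
haven::write_sav(df_data, "df_data.sav")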
My aim is to replace NA's in a Spark data frame using the Last Observation Carried Forward (LOCF) method. I wrote the following code and it works. However, it seems to take longer than expected for a larger dataset.
It would be great if someone could recommend a better approach or improve the code.
Example and code with sparklyr
In the following example, NA's are replaced after ordering the rows by time and grouping them by grp.
df_with_nas <- data.frame(time = seq(as.Date('2001/01/01'),
as.Date('2010/01/01'), length.out = 10),
grp = c(rep(1, 5), rep(2, 5)),
v1 = c(1, rep(NA, 3), 5, rep(NA, 5)),
v2 = c(NA, NA, 3, rep(NA, 4), 3, NA, NA))
tbl <- copy_to(sc, df_with_nas, overwrite = TRUE)
tbl %>%
spark_apply(function(df) {
library(dplyr)
na_locf <- function(x) {
v <- !is.na(x)
c(NA, x[v])[cumsum(v) + 1]
}
df %>% arrange(time) %>% group_by(grp) %>% mutate_at(vars(-v1, -grp),
funs(na_locf(.)))
})
# # Source: spark<?> [?? x 4]
# time grp v1 v2
# <dbl> <dbl> <dbl> <dbl>
# 1 11323 1 1 NaN
# 2 11688. 1 NaN NaN
# 3 12053. 1 NaN 3
# 4 12419. 1 NaN 3
# 5 12784. 1 5 3
# 6 13149. 2 NaN NaN
# 7 13514. 2 NaN NaN
# 8 13880. 2 NaN 3
# 9 14245. 2 NaN 3
# 10 14610 2 NaN 3
data.table
Following approach with data.table works quite fast for the data I have. I am expecting the size of the data to increase soon, and then I may have to rely on sparklyr.
library(data.table)
setDT(df_with_nas)
df_with_nas <- df_with_nas[order(time)]
cols <- c("v1", "v2")
df_with_nas[, (cols) := zoo::na.locf(.SD, na.rm = FALSE),
by = grp, .SDcols = cols]
I did this sort of loop; it is quite slow...
df_with_nas = df_with_nas %>% mutate(row = 1:nrow(df_with_nas))
for (n in 1:50) {
  df_with_nas = df_with_nas %>%
    arrange(row) %>%
    mutate_all(~ if_else(is.na(.), lag(., 1), .))
}
Run until no NA's remain; then
collect(df_with_nas)
will trigger the computation.
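A hedged sketch of that stopping rule, replacing the fixed 50 passes with a check on the remaining NA count (leading NA's can never be filled by lag(), so the loop also stops once the count stops shrinking):
prev_na <- Inf
repeat {
  df_with_nas <- df_with_nas %>%
    arrange(row) %>%
    mutate_all(~ if_else(is.na(.), lag(., 1), .))
  # count remaining NA's across all columns
  n_na <- df_with_nas %>%
    mutate_all(~ as.integer(is.na(.))) %>%
    summarise_all(sum) %>%
    collect() %>%
    unlist() %>%
    sum()
  if (n_na == 0 || n_na >= prev_na) break
  prev_na <- n_na
}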
You can leverage the spark_apply() function and run the na.locf function on each of your cluster nodes.
Install R runtimes on each of your cluster nodes.
Install the zoo R package on each node as well.
Run spark_apply this way:
data_filled <- spark_apply(data_with_holes, function(df) zoo::na.locf(df))
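If the fill has to respect groups (as in the original example), spark_apply() also accepts a group_by argument; a minimal sketch under that assumption, with na.rm = FALSE so leading NA's are kept rather than dropped:
# apply na.locf independently within each grp partition
data_filled <- spark_apply(
  data_with_holes,
  function(df) zoo::na.locf(df, na.rm = FALSE),
  group_by = "grp"
)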
You can do this quite quickly using SQL, with the added benefit that you can easily apply LOCF on a grouped basis. The pattern you want to use is LAST_VALUE(column, true) OVER (window): this searches over the window for the most recent column value which is not NA (passing "true" to LAST_VALUE sets its ignore-nulls flag). Since you want to look backwards from the current value, the window should be
ORDER BY time
ROWS BETWEEN UNBOUNDED PRECEDING AND -1 FOLLOWING
Of course, if the first value in the group is NA it will remain NA.
library(sparklyr)
library(dplyr)
sc <- spark_connect(master = "local")
test_table <- data.frame(
v1 = c(1, 2, NA, 3, NA, 5, NA, 6, NA),
v2 = c(1, 1, 1, 1, 1, 2, 2, 2, 2),
time = c(1, 2, 3, 4, 5, 2, 1, 3, 4)
) %>%
sdf_copy_to(sc, ., "test_table")
spark_session(sc) %>%
sparklyr::invoke("sql", "SELECT *, LAST_VALUE(v1, true)
OVER (PARTITION BY v2
ORDER BY time
ROWS BETWEEN UNBOUNDED PRECEDING AND -1 FOLLOWING)
AS last_non_na
FROM test_table") %>%
sdf_register() %>%
mutate(v1 = ifelse(is.na(v1), last_non_na, v1))
#> # Source: spark<?> [?? x 4]
#> v1 v2 time last_non_na
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 1 NaN
#> 2 2 1 2 1
#> 3 2 1 3 2
#> 4 3 1 4 2
#> 5 3 1 5 3
#> 6 NaN 2 1 NaN
#> 7 5 2 2 NaN
#> 8 6 2 3 5
#> 9 6 2 4 6
Created on 2019-08-27 by the reprex package (v0.3.0)
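A hedged side note: the final mutate() could equally be written mutate(v1 = coalesce(v1, last_non_na)), since dbplyr translates coalesce() to SQL's COALESCE.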
I am newish to R and having trouble with a for loop over unique values.
with the df:
id = c(1,2,2,3,3,4)
rank = c(1,2,1,3,3,4)
df = data.frame(id, rank)
I run:
df$dg <- logical(6)
for(i in unique(df$id)){
ifelse(!unique(df$rank), df$dg ==T, df$dg == F)
}
I am trying to mark the $dg variable as T provided that rank is different for each unique id, and F if rank is the same within each id.
I am not getting any errors, but I am only getting F for all values of $dg even though I should be getting a mix.
I have also used the following loop with the same results:
for(i in unique(df$id)){
ifelse(length(unique(df$rank)), df$dg ==T, df$dg == F)
}
I have read other similar posts but the advice has not worked for my case.
From Comments:
I want to mark dg TRUE for all instances of an id if rank changed at all for that id. I'm looking to say: for a given id, which has anywhere between 1 and 13 instances, mark dg TRUE if rank differs across instances.
Update: How to identify groups (ids) that only have one rank?
After the clarification the OP provided, this would be a solution for this particular case:
library(dplyr)
df %>%
group_by(id) %>%
mutate(dg = ifelse( length(unique(rank))>1 | n() == 1, T, F))
For another data set (presented below), which has an id with both duplicated and non-duplicated rank values, this would be the output:
df2 %>%
group_by(id) %>%
mutate(dg = ifelse( length(unique(rank))>1 | n() == 1, T, F))
#:OUTPUT:
# Source: local data frame [9 x 3]
# Groups: id [5]
#
# # A tibble: 9 x 3
# id rank dg
# <dbl> <dbl> <lgl>
# 1 1 1 TRUE
# 2 2 2 TRUE
# 3 2 1 TRUE
# 4 3 3 FALSE
# 5 3 3 FALSE
# 6 4 4 TRUE
# 7 5 1 TRUE
# 8 5 1 TRUE
# 9 5 3 TRUE
Data-no-2:
df2 <- structure(list(id = c(1, 2, 2, 3, 3, 4, 5, 5, 5), rank = c(1, 2, 1, 3, 3, 4, 1, 1, 3
)), .Names = c("id", "rank"), row.names = c(NA, -9L), class = "data.frame")
How to identify duplicated rows within each group (id)?
You can use dplyr package:
library(dplyr)
df %>%
group_by(id, rank) %>%
mutate(dg = ifelse(n() > 1, F,T))
This will give you:
# Source: local data frame [6 x 3]
# Groups: id, rank [5]
#
# # A tibble: 6 x 3
# id rank dg
# <dbl> <dbl> <lgl>
# 1 1 1 TRUE
# 2 2 2 TRUE
# 3 2 1 TRUE
# 4 3 3 FALSE
# 5 3 3 FALSE
# 6 4 4 TRUE
Note: You can simply convert it back to a data.frame().
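A minimal sketch of that conversion:
# drop the grouping and return a plain data.frame
df %>%
  group_by(id, rank) %>%
  mutate(dg = ifelse(n() > 1, F, T)) %>%
  ungroup() %>%
  as.data.frame()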
A data.table solution would be:
dt <- data.table(df)
dt$dg <- ifelse(dt[, dg := .N, by = list(id, rank)]$dg > 1, F, T)
Data:
df <- structure(list(id = c(1, 2, 2, 3, 3, 4), rank = c(1, 2, 1, 3,
3, 4)), .Names = c("id", "rank"), row.names = c(NA, -6L), class = "data.frame")
# > df
# id rank
# 1 1 1
# 2 2 2
# 3 2 1
# 4 3 3
# 5 3 3
# 6 4 4
N.B. Unless you want a different identifier rather than TRUE/FALSE, using ifelse() is redundant and computationally costly. (@DavidArenburg)
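Following that note, a sketch of the ifelse()-free version of the dplyr answer above:
# n() == 1 is already logical, so no ifelse() is needed
df %>%
  group_by(id, rank) %>%
  mutate(dg = n() == 1)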