I have a data frame with two columns. I want to add a third column (or modify the second) so that it holds a sequence of integers that restarts at 1 wherever a 1 already appears in the second column.
I can do this with a loop, but what is the "right" R way of doing it?
Here's my loop:
for (i in seq_len(nrow(df2))) {
  # restart the counter at 1 whenever col2 is 1, otherwise increment the previous value
  df2$col3[i] <- ifelse(df2$col2[i] == 1, 1, df2$col3[i - 1] + 1)
  # ifelse() returns NA when col2 is NA, so handle that case explicitly
  if (is.na(df2$col2[i])) df2$col3[i] <- df2$col3[i - 1] + 1
}
Here is a sample data set with 20 rows:
structure(list(col1 = c(478.69, 320.45, 503.7, 609.3, 478.19, 419.633683050051, 552.939975773916,
785.119385505095, 18.2542654918507, 98.6469651805237, 132.587260054424,
697.119552921504, 512.560374778695, 916.425200179219, 14.3385051051155
), col2 = c(1, NA, 1, NA, NA, 1, NA, 1, NA, NA, NA, NA, 1, NA,
NA, NA, NA, NA, NA, NA)), class = "data.frame", row.names = c(NA,
-20L))
I don't know if this is the way to do it, but it's one way:
df$col3 <- unlist(sapply(diff(c(which(!is.na(df$col2)), nrow(df) + 1)), seq))
df
#> col1 col2 col3
#> 1 478.69000 1 1
#> 2 320.45000 NA 2
#> 3 503.70000 1 1
#> 4 609.30000 NA 2
#> 5 478.19000 NA 3
#> 6 478.69000 1 1
#> 7 320.45000 NA 2
#> 8 503.70000 1 1
#> 9 609.30000 NA 2
#> 10 478.19000 NA 3
#> 11 419.63368 NA 4
#> 12 552.93998 NA 5
#> 13 785.11939 1 1
#> 14 18.25427 NA 2
#> 15 98.64697 NA 3
#> 16 132.58726 NA 4
#> 17 697.11955 NA 5
#> 18 512.56037 NA 6
#> 19 916.42520 NA 7
#> 20 14.33851 NA 8
Note that the first 5 values of col1 were missing from your dput, so I reused the first five supplied values to fill them in; they're not relevant to the question anyway.
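For comparison, here is a sketch of an equivalent base-R approach: cumsum() over the non-NA markers gives a group id that increments at every 1, and ave() numbers the rows within each group. It assumes, as in the sample data, that col2 starts with a 1.
grp <- cumsum(!is.na(df$col2))            # group id; increments at every non-NA value in col2
df$col3 <- ave(seq_len(nrow(df)), grp,    # restart a row counter within each group
               FUN = seq_along)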
Data
df <- structure(list(col1 = c(478.69, 320.45, 503.7, 609.3, 478.19,
478.69, 320.45, 503.7, 609.3, 478.19, 419.633683050051, 552.939975773916,
785.119385505095, 18.2542654918507, 98.6469651805237, 132.587260054424,
697.119552921504, 512.560374778695, 916.425200179219, 14.3385051051155
), col2 = c(1, NA, 1, NA, NA, 1, NA, 1, NA, NA, NA, NA, 1, NA,
NA, NA, NA, NA, NA, NA)), class = "data.frame", row.names = c(NA,
-20L))
df
#> col1 col2
#> 1 478.69000 1
#> 2 320.45000 NA
#> 3 503.70000 1
#> 4 609.30000 NA
#> 5 478.19000 NA
#> 6 478.69000 1
#> 7 320.45000 NA
#> 8 503.70000 1
#> 9 609.30000 NA
#> 10 478.19000 NA
#> 11 419.63368 NA
#> 12 552.93998 NA
#> 13 785.11939 1
#> 14 18.25427 NA
#> 15 98.64697 NA
#> 16 132.58726 NA
#> 17 697.11955 NA
#> 18 512.56037 NA
#> 19 916.42520 NA
#> 20 14.33851 NA
I have a large dataset in which the answers to one question are spread across several columns. Columns that belong together share the same prefix. How can I create a subset of the data for each question based on that prefix?
Here is an example dataset. I would like an efficient and easily adaptable solution that creates a dataset containing only the values for question one, two, or three.
structure(list(ID = c(1, 2, 3, 4, 5, 6, 7, 8), Question1a = c(1,
1, NA, NA, 1, 1, 1, NA), Question1b = c(NA, 1, NA, 1, NA, 1,
NA, 1), Question1c = c(1, 1, NA, NA, 1, NA, NA, NA), Question2a = c(1,
NA, NA, NA, 1, 1, NA, NA), Question2b = c(NA, 1, NA, 1, NA, NA,
NA, NA), Question3a = c(NA, NA, NA, NA, 1, 1, 1, NA), Question3b = c(NA,
NA, 1, 1, NA, NA, NA, NA)), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -8L))
You can use sapply and a function:
list_data <- sapply(c("Question1", "Question2", "Question3"),
function(x) df[startsWith(names(df),x)], simplify = FALSE)
This will store everything in a list. To get the individual data sets in the global environment as individual objects, use:
list2env(list_data, globalenv())
Output
# $Question1
# # A tibble: 8 × 3
# Question1a Question1b Question1c
# <dbl> <dbl> <dbl>
# 1 1 NA 1
# 2 1 1 1
# 3 NA NA NA
# 4 NA 1 NA
# 5 1 NA 1
# 6 1 1 NA
# 7 1 NA NA
# 8 NA 1 NA
#
# $Question2
# # A tibble: 8 × 2
# Question2a Question2b
# <dbl> <dbl>
# 1 1 NA
# 2 NA 1
# 3 NA NA
# 4 NA 1
# 5 1 NA
# 6 1 NA
# 7 NA NA
# 8 NA NA
#
# $Question3
# # A tibble: 8 × 2
# Question3a Question3b
# <dbl> <dbl>
# 1 NA NA
# 2 NA NA
# 3 NA 1
# 4 NA 1
# 5 1 NA
# 6 1 NA
# 7 1 NA
# 8 NA NA
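If you would rather not copy the subsets into the global environment, you can also keep working with the list directly; a small usage sketch, reusing list_data from above:
list_data$Question1                                  # one question's columns
sapply(list_data, function(d) colSums(!is.na(d)))    # e.g. count answers per item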
I believe the underlying question is about data formats.
Here are a few:
library(tidyverse)
structure(
list(
ID = c(1, 2, 3, 4, 5, 6, 7, 8),
Question1a = c(1,
1, NA, NA, 1, 1, 1, NA),
Question1b = c(NA, 1, NA, 1, NA, 1,
NA, 1),
Question1c = c(1, 1, NA, NA, 1, NA, NA, NA),
Question2a = c(1,
NA, NA, NA, 1, 1, NA, NA),
Question2b = c(NA, 1, NA, 1, NA, NA,
NA, NA),
Question3a = c(NA, NA, NA, NA, 1, 1, 1, NA),
Question3b = c(NA,
NA, 1, 1, NA, NA, NA, NA)
),
class = c("tbl_df", "tbl", "data.frame"),
row.names = c(NA, -8L)
) -> square_df
square_df %>%
pivot_longer(-ID,
names_to = c("Question", "Item"),
names_pattern = "Question(\\d+)(\\w+)") ->
long_df
long_df
#> # A tibble: 56 × 4
#> ID Question Item value
#> <dbl> <chr> <chr> <dbl>
#> 1 1 1 a 1
#> 2 1 1 b NA
#> 3 1 1 c 1
#> 4 1 2 a 1
#> 5 1 2 b NA
#> 6 1 3 a NA
#> 7 1 3 b NA
#> 8 2 1 a 1
#> 9 2 1 b 1
#> 10 2 1 c 1
#> # … with 46 more rows
long_df %>%
na.omit(value) ->
sparse_long_df
sparse_long_df
#> # A tibble: 22 × 4
#> ID Question Item value
#> <dbl> <chr> <chr> <dbl>
#> 1 1 1 a 1
#> 2 1 1 c 1
#> 3 1 2 a 1
#> 4 2 1 a 1
#> 5 2 1 b 1
#> 6 2 1 c 1
#> 7 2 2 b 1
#> 8 3 3 b 1
#> 9 4 1 b 1
#> 10 4 2 b 1
#> # … with 12 more rows
sparse_long_df %>%
nest(data = c(ID, Item, value)) ->
nested_long_df
nested_long_df
#> # A tibble: 3 × 2
#> Question data
#> <chr> <list>
#> 1 1 <tibble [12 × 3]>
#> 2 2 <tibble [5 × 3]>
#> 3 3 <tibble [5 × 3]>
Created on 2022-05-12 by the reprex package (v2.0.1)
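If you then want the per-question tibbles as a named list (comparable to the other answers here), one possible follow-up, reusing nested_long_df from above, is:
question_list <- purrr::set_names(nested_long_df$data,
                                  paste0("Question", nested_long_df$Question))
question_list$Question2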
You could also use map to store each dataframe in a list, e.g.
library(purrr)
library(dplyr)

# 3 = number of questions
map(1:3,
function(x){
quest <- paste0("Question",x)
select(df, ID, starts_with(quest))
})
Output:
[[1]]
# A tibble: 8 x 4
ID Question1a Question1b Question1c
<dbl> <dbl> <dbl> <dbl>
1 1 1 NA 1
2 2 1 1 1
3 3 NA NA NA
4 4 NA 1 NA
5 5 1 NA 1
6 6 1 1 NA
7 7 1 NA NA
8 8 NA 1 NA
[[2]]
# A tibble: 8 x 3
ID Question2a Question2b
<dbl> <dbl> <dbl>
1 1 1 NA
2 2 NA 1
3 3 NA NA
4 4 NA 1
5 5 1 NA
6 6 1 NA
7 7 NA NA
8 8 NA NA
[[3]]
# A tibble: 8 x 3
ID Question3a Question3b
<dbl> <dbl> <dbl>
1 1 NA NA
2 2 NA NA
3 3 NA 1
4 4 NA 1
5 5 1 NA
6 6 1 NA
7 7 1 NA
8 8 NA NA
I found a really intuitive solution using the dplyr package's select and starts_with functions. Alternatively, you can replace starts_with with contains if the related variables share some other common feature rather than a prefix.
Q1 <- Survey %>%
select(
starts_with("Question1")
)
Q2 <- Survey %>%
select(
starts_with("Question2")
)
Q3 <- Survey %>%
select(
starts_with("Question3")
)
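If there are many questions, the same idea can be wrapped in a loop so each prefix isn't typed out by hand; a sketch, assuming the survey data is in a data frame called Survey as above:
library(dplyr)
prefixes <- paste0("Question", 1:3)
question_subsets <- lapply(prefixes, function(p) select(Survey, starts_with(p)))
names(question_subsets) <- prefixes   # access as question_subsets$Question1, etc.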
Say I have the following dataframe:
ABC1_old <- c(1, 5, 3, 4, 3, NA, NA, NA, NA, NA)
ABC2_old <- c(4, 2, 1, 1, 5, NA, NA, NA, NA, NA)
ABC1_adj <- c(NA, NA, NA, NA, NA, 5, 5, 1, 2, 4)
ABC2_adj <- c(NA, NA, NA, NA, NA, 3, 2, 1, 4, 2)
df <- data.frame(ABC1_old, ABC2_old, ABC1_adj, ABC2_adj)
I want to create a column that compares each pair of ABCn_old with its corresponding ABCn_adj. (So ABC1_old would be compared against ABC1_adj, etc.) The resulting column would be called ABCn_new. The rule is: if ABCn_old is NA, fill in the blank with the corresponding value from ABCn_adj; otherwise use ABCn_old's value. The new columns would look like this:
df$ABC1_new <- c(1, 5, 3, 4, 3, 5, 5, 1, 2, 4)
df$ABC2_new <- c(4, 2, 1, 1, 5, 3, 2, 1, 4, 2)
I know a simple mutate could work here, but I would like to use some kind of tidyverse looping via purrr if possible since the dataset is much larger in reality. Any ideas for the best way to achieve this?
library(dplyr)
library(purrr)
library(stringr)

map_dfc(split.default(df, str_remove(names(df), "_.*")), ~coalesce(!!!.x))
# A tibble: 10 x 2
ABC1 ABC2
<dbl> <dbl>
1 1 4
2 5 2
3 3 1
4 4 1
5 3 5
6 5 3
7 5 2
8 1 1
9 2 4
10 4 2
Putting it together:
df %>%
split.default(str_replace(names(.), "_.*", "_new")) %>%
map_dfc(~coalesce(!!!.x))%>%
cbind(df, .)
ABC1_old ABC2_old ABC1_adj ABC2_adj ABC1_new ABC2_new
1 1 4 NA NA 1 4
2 5 2 NA NA 5 2
3 3 1 NA NA 3 1
4 4 1 NA NA 4 1
5 3 5 NA NA 3 5
6 NA NA 5 3 5 3
7 NA NA 5 2 5 2
8 NA NA 1 1 1 1
9 NA NA 2 4 2 4
10 NA NA 4 2 4 2
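A note on the split.default() step, since it does the heavy lifting: it splits the data frame column-wise by everything before the underscore, so coalesce() is applied to each matched pair of columns. A quick sketch of the intermediate result:
# split.default() groups columns by the text before "_", i.e. "ABC1" and "ABC2",
# producing a list of two small data frames:
#   $ABC1: columns ABC1_old, ABC1_adj
#   $ABC2: columns ABC2_old, ABC2_adj
# coalesce(!!!.x) then takes the first non-NA value across each pair, row by row.
split.default(df, str_remove(names(df), "_.*"))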
Using tidyverse
library(dplyr)
library(tidyr)
library(stringr)
df %>%
mutate(rn = row_number()) %>%
pivot_longer(cols = -rn, names_to = c(".value", 'grp'),
names_sep = '_', values_drop_na = TRUE) %>%
select(-grp, -rn) %>%
rename_all(~ str_c(., '_new')) %>% bind_cols(df, .)
# ABC1_old ABC2_old ABC1_adj ABC2_adj ABC1_new ABC2_new
#1 1 4 NA NA 1 4
#2 5 2 NA NA 5 2
#3 3 1 NA NA 3 1
#4 4 1 NA NA 4 1
#5 3 5 NA NA 3 5
#6 NA NA 5 3 5 3
#7 NA NA 5 2 5 2
#8 NA NA 1 1 1 1
#9 NA NA 2 4 2 4
#10 NA NA 4 2 4 2
Or using dplyr
df %>%
mutate(across(ends_with('old'),
~ coalesce(., get(str_replace(cur_column(),
'old', 'adj'))), .names = '{.col}_new'))
I have a package on GitHub that solves this and similar problems. In this case we can use dplyover::across2 to apply one (or more) functions to two sets of columns, which can be selected with tidyselect. In the .names argument we can specify "{pre}" to refer to the common prefix of both sets of columns.
library(dplyr)
library(dplyover) # https://github.com/TimTeaFan/dplyover
df %>%
mutate(across2(ends_with("_old"),
ends_with("_adj"),
~ coalesce(.x, .y),
.names = "{pre}_new"))
#> ABC1_old ABC2_old ABC1_adj ABC2_adj ABC1_new ABC2_new
#> 1 1 4 NA NA 1 4
#> 2 5 2 NA NA 5 2
#> 3 3 1 NA NA 3 1
#> 4 4 1 NA NA 4 1
#> 5 3 5 NA NA 3 5
#> 6 NA NA 5 3 5 3
#> 7 NA NA 5 2 5 2
#> 8 NA NA 1 1 1 1
#> 9 NA NA 2 4 2 4
#> 10 NA NA 4 2 4 2
Created on 2021-05-16 by the reprex package (v0.3.0)
I have a data frame from a service log. I need to add an "order" column and group the rows according to the following rule:
If another service value "A" appears within the next 5 rows after a service value "A", all rows in between belong to the same order ID, including the rows that have no service value. If no service value appears within the next 5 rows, the next service value starts a new order group.
dput(data)
structure(list(id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
14, 15, 16), time = structure(1:15, .Label = c("13:20:01", "13:20:02",
"13:20:03", "13:20:04", "13:20:05", "13:20:06", "13:20:07", "13:20:08",
"13:20:09", "13:20:10", "13:20:11", "13:20:12", "13:20:13", "13:20:14",
"13:20:15"), class = "factor"), apples = c(2, 2, 2, 3, 3, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2), service = structure(c(NA, 1L, 1L,
NA, 1L, NA, 1L, NA, NA, NA, NA, NA, 1L, NA, 1L), .Label = "A", class = "factor")), class = "data.frame", row.names = c(NA,
-15L))
An overview of the data:
id time apples service
1 13:20:01 2
2 13:20:02 2 A
3 13:20:03 2 A
4 13:20:04 3
5 13:20:05 3 A
6 13:20:06 2
7 13:20:07 2 A
8 13:20:08 2
9 13:20:09 2
10 13:20:10 2
11 13:20:11 2
12 13:20:12 2
14 13:20:13 2 A
15 13:20:14 2
16 13:20:15 2 A
This is the format I'm looking for: IDs 2 to 8 form one order, and IDs 14 to 16 form another.
id time apples service Order
1 13:20:01 2
2 13:20:02 2 A 1
3 13:20:03 2 A 1
4 13:20:04 3 1
5 13:20:05 3 A 1
6 13:20:06 2 1
7 13:20:07 2 A 1
8 13:20:08 2
9 13:20:09 2
10 13:20:10 2
11 13:20:11 2
12 13:20:12 2
14 13:20:13 2 A 2
15 13:20:14 2 2
16 13:20:15 2 A 2
I tried it with a for loop, but I suspect there is a way to use mutate and add the "range" condition.
Thanks for your help!
This is the output produced by tspano's code:
# A tibble: 15 x 11
id time apples service start end g0 g1 g2 g3 order
<dbl> <fct> <dbl> <fct> <dbl> <dbl> <chr> <int> <chr> <int> <int>
1 1 13:20:01 2 NA 0 3 NA 0 NA 0 NA
2 2 13:20:02 2 A 1 3 start 1 NA 0 NA
3 3 13:20:03 2 A 2 3 NA 1 NA 0 NA
4 4 13:20:04 3 NA 2 2 NA 1 NA 0 NA
5 5 13:20:05 3 A 3 2 NA 1 NA 0 NA
6 6 13:20:06 2 NA 3 1 NA 1 NA 0 NA
7 7 13:20:07 2 A 3 1 NA 1 NA 0 NA
8 8 13:20:08 2 NA 2 0 end 2 NA 0 NA
9 9 13:20:09 2 NA 2 1 NA 2 NA 0 NA
10 10 13:20:10 2 NA 1 1 NA 2 NA 0 NA
11 11 13:20:11 2 NA 1 2 NA 2 NA 0 NA
12 12 13:20:12 2 NA 0 2 NA 2 NA 0 NA
13 14 13:20:13 2 A 1 2 start 3 NA 0 NA
14 15 13:20:14 2 NA 1 1 NA 3 NA 0 NA
15 16 13:20:15 2 A 2 1 NA 3 NA 0 NA
Here is a solution using RcppRoll, which should be faster than an R for loop:
library(dplyr)

data %>%
mutate(start = RcppRoll::roll_sum(c(rep(F,4),(service=="A") %in% T), n = 5, align = "right"),
end = RcppRoll::roll_sum(c((service=="A") %in% T, rep(F,4)), n = 5, align = "left"),
g0 = case_when(start>0 & (lag(start)==0) %in% c(T,NA) ~ "start",
end ==0 ~ "end",
T ~ NA_character_)
) %>%
group_by(g1 = cumsum(!is.na(g0))) %>%
mutate(g2 = if_else(first(g0)=="end", NA_character_, "order")) %>%
ungroup() %>%
group_by(g3 = cumsum(!is.na(g2) & is.na(lag(g2))) ) %>%
mutate(order = if_else(is.na(g2), NA_integer_, g3)) %>%
ungroup() %>%
select(id, time, apples, service, order)
If you remove the final select() you can see the intermediate columns, which should make the logic clear.
I have a survey where people rank schools. The survey results in multiple columns containing 1s and 2s. I need to collapse these into a person's first and second choice.
Here is an example of the data that I have.
df1 <- tibble(Person = c(1 , 2 , 3 , 4 , 5 , 6 , 7),
School1 = c(NA, 1 , 2 , NA, NA, NA, 1 ),
School2 = c(NA, 2 , 1 , NA, NA, 1 , NA),
School3 = c(1 , NA, NA, NA, NA, 2 , NA),
School4 = c(2 , NA, NA, 1 , 2 , NA, NA),
School5 = c(NA, NA, NA, 2 , 1 , NA, 2))
Person School1 School2 School3 School4 School5
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 NA NA 1 2 NA
2 2 1 2 NA NA NA
3 3 2 1 NA NA NA
4 4 NA NA NA 1 2
5 5 NA NA NA 2 1
6 6 NA 1 2 NA NA
7 7 1 NA NA NA 2
Here is the result that I need.
df2 <- tibble(Person = c(1 , 2 , 3 , 4 , 5 , 6 , 7),
School1 = c(NA, 1 , 2 , NA, NA, NA, 1 ),
School2 = c(NA, 2 , 1 , NA, NA, 1 , NA),
School3 = c(1 , NA, NA, NA, NA, 2 , NA),
School4 = c(2 , NA, NA, 1 , 2 , NA, NA),
School5 = c(NA, NA, NA, 2 , 1 , NA, 2),
Firstchoice = c('School3', 'School1', 'School2', 'School4', 'School5', 'School2', 'School1'),
Secondchoice = c('School4', 'School2', 'School1', 'School5', 'School4', 'School3', 'School5'))
Person School1 School2 School3 School4 School5 Firstchoice Secondchoice
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
1 1 NA NA 1 2 NA School3 School4
2 2 1 2 NA NA NA School1 School2
3 3 2 1 NA NA NA School2 School1
4 4 NA NA NA 1 2 School4 School5
5 5 NA NA NA 2 1 School5 School4
6 6 NA 1 2 NA NA School2 School3
7 7 1 NA NA NA 2 School1 School5
I have looked at mutate and at using a for loop, but I can't figure out how to get them to work, since they would need to update a column in place.
Any help would be appreciated.
One tidyverse possibility could be:
df1 %>%
gather(var, val, -Person) %>%
mutate(val = ifelse(val == 1, "Firstchoice",
ifelse(val == 2, "Secondchoice", NA_character_))) %>%
na.omit() %>%
spread(val, var) %>%
left_join(df1, by = c("Person" = "Person"))
Person Firstchoice Secondchoice School1 School2 School3 School4 School5
<dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 School3 School4 NA NA 1 2 NA
2 2 School1 School2 1 2 NA NA NA
3 3 School2 School1 2 1 NA NA NA
4 4 School4 School5 NA NA NA 1 2
5 5 School5 School4 NA NA NA 2 1
6 6 School2 School3 NA 1 2 NA NA
7 7 School1 School5 1 NA NA NA 2
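gather() and spread() have since been superseded by pivot_longer() and pivot_wider(); a sketch of the same approach with the newer verbs, assuming df1 as defined in the question:
library(dplyr)
library(tidyr)

df1 %>%
  pivot_longer(-Person, names_to = "School", values_drop_na = TRUE) %>%
  mutate(value = if_else(value == 1, "Firstchoice", "Secondchoice")) %>%
  pivot_wider(names_from = value, values_from = School) %>%
  left_join(df1, ., by = "Person")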
Piece of cake with tidyr:
choices <- gather(df1, key = "school", value = "choice", -Person, na.rm=TRUE)
choices <- arrange(choices, Person, choice)
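If you then want the choices back as columns next to the original data, one possible continuation (a sketch, assuming dplyr is also attached):
wide <- spread(choices, key = choice, value = school)
names(wide)[names(wide) == "1"] <- "Firstchoice"
names(wide)[names(wide) == "2"] <- "Secondchoice"
left_join(df1, wide, by = "Person")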
I am trying to fill all NAs with the most recent non-NA value, excluding the leading NAs (the first two NAs in columns 1 and 4, and the first three NAs in columns 2 and 3). Here are my data and code:
hh<-structure(list(ka = c(NA, NA, 2, NA, NA, 3, NA, NA, NA, NA),
kb = c(NA, NA, NA, 2, NA, NA, 3, NA, NA, NA), gc = c(NA,
NA, NA, 3, NA, NA, 6, NA, NA, NA), hc = c(NA, NA, 8, NA,
NA, NA, 4, NA, NA, NA)), .Names = c("ka", "kb", "gc", "hc"
), row.names = c(NA, -10L), class = "data.frame")
library(zoo) #na.locf
library(data.table)
setDT(hh)[, `:=`(ka = c(NA, NA, na.locf(ka)),
                 kb = c(NA, NA, NA, na.locf(kb)),
                 gc = c(NA, NA, NA, na.locf(gc)),
                 hc = c(NA, NA, na.locf(hc)))][]
ka kb gc hc
1: NA NA NA NA
2: NA NA NA NA
3: 2 NA NA 8
4: 2 2 3 8
5: 2 2 3 8
6: 3 2 3 8
7: 3 3 6 4
8: 3 3 6 4
9: 3 3 6 4
10: 3 3 6 4
However, I am looking for a way to use lapply with .SD, as I have more than two columns of each type. Is this possible?
Try
setDT(hh)[, lapply(.SD, function(x) na.locf(x, na.rm=FALSE))]
Or use set
for(j in seq_along(hh)){
set(hh, i=NULL, j=j, value= na.locf(hh[[j]], na.rm=FALSE))
}
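Note that the lapply(.SD, ...) form returns a new table rather than changing hh; if you want to update hh by reference in one call, one option (a sketch) is to assign back to the columns with :=:
setDT(hh)[, (names(hh)) := lapply(.SD, na.locf, na.rm = FALSE)]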
You can use setnafill, available from data.table >= 1.12.3:
setnafill(hh, type = "locf")
hh
# ka kb gc hc
# 1 NA NA NA NA
# 2 NA NA NA NA
# 3 2 NA NA 8
# 4 2 2 3 8
# 5 2 2 3 8
# 6 3 2 3 8
# 7 3 3 6 4
# 8 3 3 6 4
# 9 3 3 6 4
# 10 3 3 6 4
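setnafill() also accepts a cols argument if only some columns should be filled; for example (a sketch, picking two columns arbitrarily):
setnafill(hh, type = "locf", cols = c("ka", "hc"))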
You don't need lapply. This is sufficient:
DT <- as.data.table(hh)
DT[, na.locf(.SD, na.rm = FALSE)]
giving:
ka kb gc hc
1: NA NA NA NA
2: NA NA NA NA
3: 2 NA NA 8
4: 2 2 3 8
5: 2 2 3 8
6: 3 2 3 8
7: 3 3 6 4
8: 3 3 6 4
9: 3 3 6 4
10: 3 3 6 4
This will also work:
DT[, lapply(.SD, na.locf0)]