Customized index column - r

I have a dataset with irregular dates column. I want to create an index column. Index ID (e.g. 1) is same for three dissimilar consecutive dates then changes (e.g. to 2) for next three dissimilar consecutive dates and so on. Here is a sample of dates and how the desired column shall look like:
structure(list(Date = c(42370, 42371, 42371, 42371, 42372, 42372,
42375, 42375, 42375, 42377, 42377, 42383, 42383, 42385, 42386,
42386, 42386, 42393, 42393, 42394, 42394, 42395, 42398, 42398,
42398, 42398), Index = c(1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4)), .Names = c("Date",
"Index"), row.names = c(NA, 26L), class = "data.frame")

Using rleid from the data.table package and cumsum:
library(data.table)
setDT(d1)[, index := (rleid(Date)-1) %% 3
][, index := cumsum(index < shift(index, fill=1))][]
gives:
Date index
1: 01-01-16 1
2: 02-01-16 1
3: 02-01-16 1
4: 02-01-16 1
5: 03-01-16 1
6: 03-01-16 1
7: 06-01-16 2
8: 06-01-16 2
9: 06-01-16 2
10: 08-01-16 2
11: 08-01-16 2
12: 14-01-16 2
13: 14-01-16 2
14: 16-01-16 3
15: 17-01-16 3
16: 17-01-16 3
17: 17-01-16 3
18: 24-01-16 3
19: 24-01-16 3
20: 25-01-16 4
21: 25-01-16 4
22: 26-01-16 4
23: 29-01-16 4
24: 29-01-16 4
25: 29-01-16 4
26: 29-01-16 4
Explanation:
The rleid function creates a runlength id. This means that every time Date changes, the runlength id is increased by 1.
By substracting 1 from the runlength id and taking the modulus of it (the %% 3 part) you get a vector of sequences of 0,1&2's.
As last step you take the cumulative sum of the comparison of the values with the previous values. When index < shift(index, fill=1) is TRUE, the cumsum function will count that as a one.
In order to better see what this code does, see the output of the following code which creates a variable for each step:
setDT(d1)[, index1 := (rleid(Date)-1) %% 3
][, index2 := cumsum(index1 < shift(index1, fill=1))][]
Used data:
d1 <- structure(list(Date = structure(c(16801, 16802, 16802, 16802, 16803, 16803, 16806,
16806, 16806, 16808, 16808, 16814, 16814, 16816,
16817, 16817, 16817, 16824, 16824, 16825, 16825,
16826, 16829, 16829, 16829, 16829), class = "Date")),
.Names = "Date", row.names = c(NA, 26L), class = "data.frame")

This constructs a grouped by 3 index for the unique values of Date and then uses character names to manage a lookup table for the conversion:
fac <- ((seq(length(unique(dat$Date)))-1) %/%3) +1
names(fac) <- unique(dat$Date)
dat$myIndex <- fac[as.character(dat$Date)]
dat
#-------
Date Index myIndex
1 42370 1 1
2 42371 1 1
3 42371 1 1
4 42371 1 1
5 42372 1 1
6 42372 1 1
7 42375 2 2
8 42375 2 2
9 42375 2 2
10 42377 2 2
11 42377 2 2
12 42383 2 2
13 42383 2 2
14 42385 3 3
15 42386 3 3
16 42386 3 3
17 42386 3 3
18 42393 3 3
19 42393 3 3
20 42394 4 4
21 42394 4 4
22 42395 4 4
23 42398 4 4
24 42398 4 4
25 42398 4 4
26 42398 4 4

base R. We can modify the rle (run-length encoding) of the object to group trios of values:
DF$index = with(rle(DF$Date), {
g = ceiling(seq_along(values)/3)
split(values, g) <- seq(tail(g,1))
inverse.rle(list(lengths = lengths, values = values))
})
The weird split(x,g) <- bit was borrowed from ave. If the Date column is increasing, this can be done more simply (thanks to #Jaap):
DF$index = ceiling(match(DF$Date, unique(DF$Date))/3) # or...
DF$index = ceiling(as.integer(factor(DF$Date))/3)
data.table. The data.table analogue is simpler:
library(data.table)
setDT(DF)[, index := ceiling(rleid(Date)/3)]

I used data from an earlier version of the question:
df <- data.frame(Date = c("01-01-16", "02-01-16", "02-01-16", "02-01-16",
"03-01-16", "03-01-16", "06-01-16", "06-01-16", "06-01-16", "08-01-16",
"08-01-16", "14-01-16", "14-01-16", "16-01-16", "17-01-16", "17-01-16",
"17-01-16", "24-01-16", "24-01-16", "25-01-16", "25-01-16", "26-01-16",
"29-01-16", "29-01-16", "29-01-16", "29-01-16"),
Index = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L,
3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L))
I would start by converting the Date column from character to date, and making sure the data frame is sorted by date (you don't need that part with the new version of the data where Date is already numeric, and if you are sure the data frame is already sorted by Date):
df$Date <- as.Date(df$Date, format="%d-%m-%y")
df <- df[ order(df$Date),]
Then I would convert the date to consecutive integers - one way to do it is to convert to factor and then unclass (here I used c as a shorthand to do it) - and then cut it at equal intervals:
df$ndx <- c(factor(as.numeric(df$Date)))
df$ndx <- cut(df$ndx, seq(0.5, max(df$ndx)+0.5, by=3), labels=FALSE)

Related

Find multiple "switching points" by comparing the answers in columns

I have a data set in which subjects have made choices between A and B for 13 different B's. Below is a simplified example of what the data looks like with 54 subjects and 5 choices. (1 is A, 2 is B).
subject choice1 choice2 choice3 choice4 choice5
1 1 1 1 1 2 2
2 2 1 1 2 2 2
3 3 1 2 1 2 2
4 4 1 2 2 2 2
I would like to find the questions in which subjects switch option A to B , i.e. for subject 1 this would be choice4.
In a previous study we did this by computing number of times the subject would choose option A and then selecting the corresponding option B form a separate matrix. See code below.
However, the difference now is that instead of choosing 1 switching point, subjects were asked the questions in a randomized order, and thus there is the possibility of having multiple switching points. For example in the table above, subject 3 switches to B at choice2 and again at choice4.
I would like to find both the first time the subject switches to option B, and the last time (before sticking with B for the rest of the choices).
sure_amounts <- matrix(nrow = 4, ncol = 13) # 4 treatments, 13 questions
sure_amounts[1, ] <- c(0, 2, 3, 3.5, 4, 4.5, 5, 5.5, 6, 6.5, 7, 8, 10) # Option B's
sure_amounts[2, ] <- seq(2, 14, 1)
sure_amounts[3, ] <- seq(2, 14, 1)
sure_amounts[4, ] <- seq(2, 14, 1)
b_choice <- matrix(nrow = 201, ncol = 4)
switch_choice <- matrix(nrow = 201, ncol = 4) # switching point form A to B
for(j in 1:4){ # number of treatments
for(i in 201){ # number of subjects
choice = NULL
fl = data$ID == i
k = 1 + 36*(j-1) # 36 before going to the next treatment (due to other questions)
choice = c(data[fl,k:(k+12)])
b_choice[i,j] = length(choice[choice==1])
temp = b_choice[i,j]
switch_choice[i,j] <- ifelse(temp==0, 0, sure_amounts[j, temp])
}
}
Does anyone have any tips on how to approach this? Thanks in advance!
I am not sure how you want your expected output to look like but you can try to get data in long format and for each subject select rows where they switch from 1 -> 2.
library(dplyr)
df %>%
tidyr::pivot_longer(cols = -subject) %>%
group_by(subject) %>%
filter(value == 2 & lag(value) == 1 |
value == 1 & lead(value) == 2)
# subject name value
# <int> <chr> <int>
# 1 1 choice3 1
# 2 1 choice4 2
# 3 2 choice2 1
# 4 2 choice3 2
# 5 3 choice1 1
# 6 3 choice2 2
# 7 3 choice3 1
# 8 3 choice4 2
# 9 4 choice1 1
#10 4 choice2 2
Here we can see that subject 1 moves from 1 -> 2 from choice3 -> choice4 and so on.
data
df <- structure(list(subject = 1:4, choice1 = c(1L, 1L, 1L, 1L), choice2 = c(1L,
1L, 2L, 2L), choice3 = c(1L, 2L, 1L, 2L), choice4 = c(2L, 2L,
2L, 2L), choice5 = c(2L, 2L, 2L, 2L)), class = "data.frame",
row.names = c(NA, -4L))
A Base R solution:
Essentially this code only substracts a lag of the decisions and detects if the difference is not equal to zero.
Code:
lapply(as.data.frame(t(df_1)[-1,]), function(x){
t <- x - c(x[-1], 0) # row substracted by shortened row
z <- which(t[-length(t)] != 0) # values not equal to zero and rm last value
z + 1 # remove lag
})
# $`1`
# [1] 4
# $`2`
# [1] 3
# $`3`
# [1] 2 3 4
# $`4`
# [1] 2
Data:
df_1 <- read.table(text = " subject choice1 choice2 choice3 choice4 choice5
1 1 1 1 1 2 2
2 2 1 1 2 2 2
3 3 1 2 1 2 2
4 4 1 2 2 2 2 ", header = T)
An alternative approach:
library(dplyr)
library(stringr)
library(purrr)
df %>%
mutate(g = paste0(choice1, choice2, choice3, choice4, choice5),
switches = as.character(map(g, ~pluck(str_locate_all(.x, "12"), 1)))) %>%
select(-g)
#> subject choice1 choice2 choice3 choice4 choice5 switches
#> 1 1 1 1 1 2 2 3:4
#> 2 2 1 1 2 2 2 2:3
#> 3 3 1 2 1 2 2 c(1, 3, 2, 4)
#> 4 4 1 2 2 2 2 1:2
data
df <- structure(list(subject = 1:4, choice1 = c(1L, 1L, 1L, 1L), choice2 = c(1L,
1L, 2L, 2L), choice3 = c(1L, 2L, 1L, 2L), choice4 = c(2L, 2L,
2L, 2L), choice5 = c(2L, 2L, 2L, 2L)), class = "data.frame", row.names = c("1",
"2", "3", "4"))
Created on 2020-07-10 by the reprex package (v0.3.0)

Fill in missing rows in data in R

Suppose I have a data frame like this:
1 8
2 12
3 2
5 -6
6 1
8 5
I want to add a row in the places where the 4 and 7 would have gone in the first column and have the second column for these new rows be 0, so adding these rows:
4 0
7 0
I have no idea how to do this in R.
In excel, I could use a vlookup inside an iferror. Is there a similar combo of functions in R to make this happen?
Edit: also, suppose that row 1 was missing and needed to be filled in similarly. Would this require another solution? What if I wanted to add rows until I reached ten rows?
Use tidyr::complete to fill in the missing sequence between min and max values.
library(tidyr)
library(rlang)
complete(df, V1 = min(V1):max(V1), fill = list(V2 = 0))
#Or using `seq`
#complete(df, V1 = seq(min(V1), max(V1)), fill = list(V2 = 0))
# V1 V2
# <int> <dbl>
#1 1 8
#2 2 12
#3 3 2
#4 4 0
#5 5 -6
#6 6 1
#7 7 0
#8 8 5
If we already know min and max of the dataframe we can use them directly. Let's say we want data from V1 = 1 to 10, we can do.
complete(df, V1 = 1:10, fill = list(V2 = 0))
If we don't know the column names beforehand, we can do something like :
col1 <- names(df)[1]
col2 <- names(df)[2]
complete(df, !!sym(col1) := 1:10, fill = as.list(setNames(0, col2)))
data
df <- structure(list(V1 = c(1L, 2L, 3L, 5L, 6L, 8L), V2 = c(8L, 12L,
2L, -6L, 1L, 5L)), class = "data.frame", row.names = c(NA, -6L))

How to use column indices to collect values from columns in R

x y z column_indices
6 7 1 1,2
5 4 2 3
1 3 2 1,3
I have the column indices of the values I would like to collect in a separate column like so, what I want to create is something like this:
x y z column_indices values
6 7 1 1,2 6,7
5 4 2 3 2
1 3 2 1,3 1,2
What is the simplest way to do this in R?
Thanks!
In base R, we can use apply, split the column_indices on ',', convert them to integer and get the corresponding value from the row.
df$values <- apply(df, 1, function(x) {
inds <- as.integer(strsplit(x[4], ',')[[1]])
toString(x[inds])
})
df
# x y z column_indices values
#1 6 7 1 1,2 6, 7
#2 5 4 2 3 2
#3 1 3 2 1,3 1, 2
data
df <- structure(list(x = c(6L, 5L, 1L), y = c(7L, 4L, 3L), z = c(1L,
2L, 2L), column_indices = structure(c(1L, 3L, 2L), .Label = c("1,2",
"1,3", "3"), class = "factor")), class = "data.frame", row.names = c(NA, -3L))
One solution involving dplyr and tidyr could be:
df %>%
pivot_longer(-column_indices) %>%
group_by(column_indices) %>%
mutate(values = toString(value[1:n() %in% unlist(strsplit(column_indices, ","))])) %>%
pivot_wider(names_from = "name", values_from = "value")
column_indices values x y z
<chr> <chr> <int> <int> <int>
1 1,2 6, 7 6 7 1
2 3 2 5 4 2
3 1,3 1, 2 1 3 2

Multiple uses of setdiff() on consecutive groups without for looping

I would like to setdiff between consecutive groups without for looping, if possible with a datatable way or a function of apply family.
Dataframe df :
id group
1 L1 1
2 L2 1
3 L1 2
4 L3 2
5 L4 2
6 L3 3
7 L5 3
8 L6 3
9 L1 4
10 L4 4
11 L2 5
I want to know how much new ids there are between consecutive groups. So, for example, if we compare group 1 and 2, there are two new ids : L3 and L4 so it returns 2 (not with setdiff directly but with length()), if we compare group 2 and 3, L5 and L6 are the news ids so it returns 2 and so on.
Expected results :
new_id
2
2
2
1
Data :
structure(list(id = structure(c(1L, 2L, 1L, 3L, 4L, 3L, 5L, 6L,
1L, 4L, 2L), .Label = c("L1", "L2", "L3", "L4", "L5", "L6"), class = "factor"),
group = c(1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5)), class = "data.frame", row.names = c(NA,
-11L), .Names = c("id", "group"))
Here is an option with mapply:
lst <- with(df, split(id, group))
mapply(function(x, y) length(setdiff(y, x)), head(lst, -1), tail(lst, -1))
#1 2 3 4
#2 2 2 1
Here is a data.table way with merge. Suppose the original data.frame is named dt:
library(data.table)
setDT(dt)
dt2 <- copy(dt)[, group := group + 1]
merge(
dt, dt2, by = 'group', allow.cartesian = T
)[, .(n = length(setdiff(id.x, id.y))), by = group]
# group n
# 1: 2 2
# 2: 3 2
# 3: 4 2
# 4: 5 1
You could use Reduce to run a comparison function on pairwise elements in a list. For example
xx<-Reduce(function(a, b) {
x <- setdiff(b$id, a$id);
list(id=b$id, new=x, newcount=length(x))
}, split(df, df$group),
acc=TRUE)[-1]
Then you can get the counts of new elements out with
sapply(xx, '[[', "newcount")
and you can get the new values with
sapply(xx, '[[', "new")
L = split(d, d$group) #Split data ('d') by group and create a list
#use lapply to access 'id' for each sub group in the list and obtain setdiff
sapply(2:length(L), function(i)
setNames(length(setdiff(L[[i]][,1], L[[i-1]][,1])),
nm = paste(names(L)[i], names(L)[i-1], sep = "-")))
#2-1 3-2 4-3 5-4
# 2 2 2 1

Summation of the corresponding number of values which are in different columns

My data frame looks like below:
df<-data.frame(alphabets1=c("A","B","C","B","C"," ","NA"),alphabets2=c("B","A","D","D"," ","E","NA"),alphabets3=c("C","F","G"," "," "," ","NA"), number = c("1","2","3","1","4","1","2"))
alphabets1 alphabets2 alphabets3 number
1 A B C 1
2 B A F 2
3 C D G 3
4 B D 1
5 C 4
6 E 1
7 NA NA NA 2
NOTE1: within the row all the values are unique, that is, below shown is not possible.
alphabets1 alphabets2 alphabets3 number
1 A A C 1
NOTE2: data frame may contains NA or is blank
I am struggling to get the below output: which is nothing but a dataframe which has the alphabets and the sum of their corresponding numbers, that is A alphabet is in 1st and 2nd rows so its sum of its corresponding number is 1+2 i.e 3 and let's say B, its in 1st, 2nd and 4th row so the sum will be 1+2+1 i.e 4.
output <-data.frame(alphabets1=c("A","B","C","D","E","F","G"), number = c("3","4","8","4","1","2","3"))
output
alphabets number
1 A 3
2 B 4
3 C 8
4 D 4
5 E 1
6 F 2
7 G 3
NOTE3: output may or may not have the NA or blanks (it doesn't matter!)
We can reshape it to 'long' format and do a group by operation
library(data.table)
melt(setDT(df), id.var="number", na.rm = TRUE, value.name = "alphabets1")[
!grepl("^\\s*$", alphabets1), .(number = sum(as.integer(as.character(number)))),
alphabets1]
# alphabets1 number
#1: A 3
#2: B 4
#3: C 8
#4: D 4
#5: E 1
#6: F 2
#7: G 3
Or we can use xtabs from base R
xtabs(number~alphabets1, data.frame(alphabets1 = unlist(df[-4]),
number = as.numeric(as.character(df[,4]))))
NOTE: In the OP's dataset, the missing values were "NA", and not real NA and the 'number' column is factor (which was changed by converting to integer for doing the sum)
data
df <- data.frame(alphabets1=c("A","B","C","B","C"," ",NA),
alphabets2=c("B","A","D","D"," ","E",NA),
alphabets3=c("C","F","G"," "," "," ",NA),
number = c("1","2","3","1","4","1","2"))
Here is a base R method using sapply and table. I first converted df$number into a numeric. See data section below.
data.frame(table(sapply(df[-length(df)], function(i) rep(i, df$number))))
Var1 Freq
1 11
2 A 3
3 B 4
4 C 8
5 D 4
6 E 1
7 F 2
8 G 3
9 NA 6
To make the output a little bit nicer, we could wrap a few more functions and perform a subsetting within sapply.
data.frame(table(droplevels(unlist(sapply(df[-length(df)],
function(i) rep(i[i %in% LETTERS],
df$number[i %in% LETTERS])),
use.names=FALSE))))
Var1 Freq
1 A 3
2 B 4
3 C 8
4 D 4
5 E 1
6 F 2
7 G 3
It may be easier to do this afterward, though.
data
I ran
df$number <- as.numeric(df$number)
on the OP's data resulting in this.
df <-
structure(list(alphabets1 = structure(c(2L, 3L, 4L, 3L, 4L, 1L,
5L), .Label = c(" ", "A", "B", "C", "NA"), class = "factor"),
alphabets2 = structure(c(3L, 2L, 4L, 4L, 1L, 5L, 6L), .Label = c(" ",
"A", "B", "D", "E", "NA"), class = "factor"), alphabets3 = structure(c(2L,
3L, 4L, 1L, 1L, 1L, 5L), .Label = c(" ", "C", "F", "G", "NA"
), class = "factor"), number = c(1, 2, 3, 1, 4, 1, 2)), .Names = c("alphabets1",
"alphabets2", "alphabets3", "number"), row.names = c(NA, -7L), class = "data.frame")

Resources