How to merge two data frames by ranges in R? - r

Suppose I have two data frames such like:
set.seed(123)
df0<-data.frame(pos=3:12,
count0=rbinom(10, 50, 0.5),
count2=rbinom(10, 20, 0.5))
df0
pos count0 count2
1 3 23 14
2 4 28 10
3 5 24 11
4 6 29 10
5 7 30 7
6 8 19 13
7 9 25 8
8 10 29 6
9 11 25 9
10 12 25 14
df1<-data.frame(start=c(4, 7, 11, 14),
end=c(6, 9, 12, 15),
cnv=c(1, 2, 3, 4))
df1
start end cnv
1 4 6 1
2 7 9 2
3 11 12 3
4 14 15 4
What I want is to merge df0 and df1 using the df0$pos with the ranges ofdf1$start and df1$end. If the pos falls into the range of start:end, fills in the cnv from df1 otherwise set cnv as zeros. An output from the above example would be:
pos count0 count2 cnv
1 3 23 14 0
2 4 28 10 1
3 5 24 11 1
4 6 29 10 1
5 7 30 7 2
6 8 19 13 2
7 9 25 8 2
8 10 29 6 0
9 11 25 9 3
10 12 25 14 3

We can use sapply to find if there is an index which is present in range else return 0.
df0$cnv <- sapply(df0$pos, function(x) {
inds <- x >= df1$start & x <= df1$end
if (any(inds))
df1$cnv[inds]
else 0
})
df0
# pos count0 count2 cnv
#1 3 23 14 0
#2 4 28 10 1
#3 5 24 11 1
#4 6 29 10 1
#5 7 30 7 2
#6 8 19 13 2
#7 9 25 8 2
#8 10 29 6 0
#9 11 25 9 3
#10 12 25 14 3

Related

Adding a sequential value using a loop

I have a large dataframe which is effectively combined output from a nested list using do.call(rbind, nested_list)
The output has the same number of rows for each list element (e.g. 5 rows per list) and I need to add a column which has a unique numeric code for each list (or group). How can I write a loop to reproduce the group column I have included in the example below, e.g. the five rows have a group value == 1, rows 6 to 10 have a group value == 2, rows 11 to 15 have a group value == 3
df <- data.frame("ID" = 1:15)
df$Var_A <- c(1,3,5,7,9,11,13,15,17,19,21,23,25,27,29)
df$Var_B <- c(10,0,0,0,12,12,12,12,0,14,NA_real_,14,16,16,16)
df$Var_C <- c(10,12,14,16,10,12,14,16,10,12,14,16,10,12,14)
df$Var_D <- c(10,12,14,16,10,12,14,16,10,12,14,16,10,12,14)
df$New_A <- c(2,5,5,8,11,14,15,17,20,21,22,23,25,25,27)
df$New_B <- c(10,0,0,0,12,12,12,12,0,14,NA_real_,14,16,16,16)
df$New_C <- c(10,12,14,16,10,12,14,16,10,12,14,16,10,12,14)
df$New_D <- c(10,12,14,16,10,12,14,16,10,12,14,16,10,12,14)
df$Group <- c(1,1,1,1,1,2,2,2,2,2,3,3,3,3,3)
ID Var_A Var_B Var_C Var_D New_A New_B New_C New_D Group
1 1 1 10 10 10 2 10 10 10 1
2 2 3 0 12 12 5 0 12 12 1
3 3 5 0 14 14 5 0 14 14 1
4 4 7 0 16 16 8 0 16 16 1
5 5 9 12 10 10 11 12 10 10 1
6 6 11 12 12 12 14 12 12 12 2
7 7 13 12 14 14 15 12 14 14 2
8 8 15 12 16 16 17 12 16 16 2
9 9 17 0 10 10 20 0 10 10 2
10 10 19 14 12 12 21 14 12 12 2
11 11 21 NA 14 14 22 NA 14 14 3
12 12 23 14 16 16 23 14 16 16 3
13 13 25 16 10 10 25 16 10 10 3
14 14 27 16 12 12 25 16 12 12 3
15 15 29 16 14 14 27 16 14 14 3
You can use the ceiling function:
df <- data.frame("ID" = 1:15)
df$Var_A <- c(1,3,5,7,9,11,13,15,17,19,21,23,25,27,29)
df$Var_B <- c(10,0,0,0,12,12,12,12,0,14,NA_real_,14,16,16,16)
df$Var_C <- c(10,12,14,16,10,12,14,16,10,12,14,16,10,12,14)
df$Var_D <- c(10,12,14,16,10,12,14,16,10,12,14,16,10,12,14)
df$New_A <- c(2,5,5,8,11,14,15,17,20,21,22,23,25,25,27)
df$New_B <- c(10,0,0,0,12,12,12,12,0,14,NA_real_,14,16,16,16)
df$New_C <- c(10,12,14,16,10,12,14,16,10,12,14,16,10,12,14)
df$New_D <- c(10,12,14,16,10,12,14,16,10,12,14,16,10,12,14)
df$Group <- ceiling(as.numeric(df$ID)/5)
df
# ID Var_A Var_B Var_C Var_D New_A New_B New_C New_D Group
# 1 1 1 10 10 10 2 10 10 10 1
# 2 2 3 0 12 12 5 0 12 12 1
# 3 3 5 0 14 14 5 0 14 14 1
# 4 4 7 0 16 16 8 0 16 16 1
# 5 5 9 12 10 10 11 12 10 10 1
# 6 6 11 12 12 12 14 12 12 12 2
# 7 7 13 12 14 14 15 12 14 14 2
# 8 8 15 12 16 16 17 12 16 16 2
# 9 9 17 0 10 10 20 0 10 10 2
# 10 10 19 14 12 12 21 14 12 12 2
# 11 11 21 NA 14 14 22 NA 14 14 3
# 12 12 23 14 16 16 23 14 16 16 3
# 13 13 25 16 10 10 25 16 10 10 3
# 14 14 27 16 12 12 25 16 12 12 3
# 15 15 29 16 14 14 27 16 14 14 3
Without adding an ID or rownums we can do this using nrow and knowledge of the group length.
group_len <- 5
groups <- nrow(df)/group_len
df$group <- rep(1:groups, each = group_len)
# Example:
# rep(1:3, each = 5)
# 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3
I'd use dplyr::mutate with dplyr::row_number:
library(dplyr)
df %>%
mutate(Group=ceiling(row_number() / 5))
Output:
ID Var_A Var_B Var_C Var_D New_A New_B New_C New_D Group
1 1 1 10 10 10 2 10 10 10 1
2 2 3 0 12 12 5 0 12 12 1
3 3 5 0 14 14 5 0 14 14 1
4 4 7 0 16 16 8 0 16 16 1
5 5 9 12 10 10 11 12 10 10 1
6 6 11 12 12 12 14 12 12 12 2
7 7 13 12 14 14 15 12 14 14 2
8 8 15 12 16 16 17 12 16 16 2
9 9 17 0 10 10 20 0 10 10 2
10 10 19 14 12 12 21 14 12 12 2
11 11 21 NA 14 14 22 NA 14 14 3
12 12 23 14 16 16 23 14 16 16 3
13 13 25 16 10 10 25 16 10 10 3
14 14 27 16 12 12 25 16 12 12 3
15 15 29 16 14 14 27 16 14 14 3
An option would be to combine cumsum with rep.
cumsum(rep_len(c(TRUE, rep(FALSE, 4)), nrow(df)))
#cumsum(rep_len(c(TRUE, FALSE, FALSE, FALSE, FALSE), nrow(df))) #Alternative
# [1] 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3
Or making use of auto repeat.
df$Group <- c(TRUE, rep(FALSE, 4))
df$Group <- cumsum(df$Group)
df$Group
# [1] 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3
Or create a sequence with length of nrow and make an integer division %/%.
0:(nrow(df)-1) %/% 5
#seq(0, nrow(df)-1) %/% 5 #Alternative
#(seq_len(nrow(df))-1) %/% 5 #Alternative
# [1] 0 0 0 0 0 1 1 1 1 1 2 2 2 2 2
Or using rep:
rep(1:ceiling(nrow(df)/5), each=5, length.out=nrow(df))
# [1] 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3
You could use the cut function with labels = FALSE to return an integer to use for the group.
n_per_group <- 5
df$group <- cut(x = df$ID, breaks = nrow(df) / n_per_group, labels = FALSE)
df$group
#[1] 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3

Fill zeros for missing values in R

I am trying to deal with this problem.
I have a df with a date column and I want to count the occurences per hour. Here is what I've done:
x <- df %>%
mutate(hora = hour(date)) %>%
select(hora) %>%
count(hora)
that gives as a result:
> x
# A tibble: 19 x 2
hora n
<int> <int>
1 0 1
2 1 1
3 3 1
4 8 4
5 9 7
6 10 10
7 11 14
8 12 10
9 13 8
10 14 4
11 15 5
12 16 12
13 17 4
14 18 12
15 19 9
16 20 5
17 21 2
18 22 4
19 23 4
As you can see, there are hours that don't show up that would have n=0, like 2 or 4:7. What I want is it to add the hours that are not in x with n=0 so the table is complete.
The expected output should be something like this:
hora n
1 0 12
2 1 3
3 2 5
4 3 7
5 4 8
6 5 1
7 6 0
8 7 11
9 8 6
10 9 10
11 10 9
12 11 0
13 12 0
14 13 3
15 14 0
16 15 7
17 16 8
18 17 1
19 18 2
20 19 11
21 20 6
22 21 10
23 22 9
24 23 4
I tried creating a table with hours 0:23 and all n=0 and trying to sum the two tables but obviously that didn't work. I also tried x$hour <- 0:23, thinking that the missing values would be added, but it didn't work as well.
You could convert hora to factor and use .drop = FALSE in count
library(dplyr)
library(lubridate)
df %>%
mutate(hora = factor(hour(date), levels = 0:23)) %>%
count(hora, .drop = FALSE)
Another option is to use complete :
df %>%
mutate(hora = hour(date)) %>%
count(hora) %>%
tidyr::complete(hora = 0:23, fill = list(n = 0))
A solution in Base R merges a vector of hours with the summarized data, and sets the missing counts to 0.
textFile <- "row hour count
1 0 1
2 1 1
3 3 1
4 8 4
5 9 7
6 10 10
7 11 14
8 12 10
9 13 8
10 14 4
11 15 5
12 16 12
13 17 4
14 18 12
15 19 9
16 20 5
17 21 2
18 22 4
19 23 4"
data <- read.table(text = textFile,header = TRUE)[-1]
hours <- data.frame(hour = 0:23)
merged <- merge(data,hours,all.y = TRUE)
merged[is.na(merged$count),"count"] <- 0
...and the output:
> head(merged)
hour count
1 0 1
2 1 1
3 2 0
4 3 1
5 4 0
6 5 0
>

Split into groups based on (multiple) conditions?

I have set of marbles, of different colors and weights, and I want to split them into groups based on their weight and color.
The conditions are:
A group cannot weigh more than 100 units
A group cannot have more than 5 different-colored marbles.
A reproducible example:
marbles <- data.frame(color=sample(1:20, 20), weight=sample(1:40, 20, replace=T))
color weight
1 1 22
2 15 33
3 13 35
4 11 13
5 6 26
6 8 15
7 10 3
8 16 22
9 14 21
10 3 16
11 4 26
12 20 30
13 9 31
14 2 16
15 7 12
16 17 13
17 19 19
18 5 17
19 12 12
20 18 40
And what I want is this group column:
color weight group
1 1 22 1
2 15 33 1
3 13 35 1
4 11 13 2
5 6 26 2
6 8 15 2
7 10 3 2
8 16 22 2
9 14 21 3
10 3 16 3
11 4 26 3
12 20 30 3
13 9 31 4
14 2 16 4
15 7 12 4
16 17 13 4
17 19 19 4
18 5 17 5
19 12 12 5
20 18 40 5
TIA.
The below isn't an optimal assignment to the groups, it just does it sequentially through the data frame. It's uses rowwise and might not be the most efficient way as it's not a vectorized approach.
library(dplyr)
marbles <- data.frame(color=sample(1:20, 20), weight=sample(1:40, 20, replace=T))
Below I create a rowwise function which we can apply using dplyr
assign_group <- function(color, weight) {
# Conditions
clists = append(color_list, color)
sum_val = group_sum + weight
num_colors = length(unique(color_list))
assign_condition = (sum_val <= 100 & num_colors <= 5)
#assign globals
cval <- if(assign_condition) clists else c(color)
sval <- ifelse(assign_condition, sum_val, weight)
gval <- ifelse(assign_condition, group_number, group_number + 1)
assign("color_list", cval, envir = .GlobalEnv)
assign("group_sum", sval, envir = .GlobalEnv)
assign("group_number", gval, envir = .GlobalEnv)
res = group_number
return(res)
}
I then setup a few global variables to track the allocation of the marbles to each group.
# globals
color_list <<- c()
group_sum <<- 0
group_number <<- 1
Finally run this function using mutate
test <- marbles %>% rowwise() %>% mutate(group = assign_group(color,weight)) %>% data.frame()
Which results in the below
color weight group
1 6 27 1
2 12 16 1
3 15 32 1
4 20 25 1
5 19 5 2
6 2 21 2
7 16 39 2
8 17 4 2
9 11 16 2
10 7 7 3
11 10 5 3
12 1 30 3
13 13 7 3
14 9 39 3
15 14 7 4
16 8 17 4
17 18 9 4
18 4 36 4
19 3 1 4
20 5 3 5
And seems to meet the constraints
test %>% group_by(group) %>% summarise(tot_w = sum(weight), n_c = length(unique(color)) )
group tot_w n_c
<dbl> <int> <int>
1 1 100 4
2 2 85 5
3 3 88 5
4 4 70 5
5 5 3 1
in base R you could write a recursive function as shown below:
create_group = function(df,a){
if(missing(a)) a = cumsum(df$weight)%/%100
b = !ave(df$color,a,FUN=seq_along)%%6
d = ave(df$weight,a+b,FUN=cumsum)>100
a = a+b+d
if (any(b|d)) create_group(df,a) else cbind(df,group = a+1)
}
create_group(df)
color weight group
1 1 22 1
2 15 33 1
3 13 35 1
4 11 13 2
5 6 26 2
6 8 15 2
7 10 3 2
8 16 22 2
9 14 21 3
10 3 16 3
11 4 26 3
12 20 30 3
13 9 31 4
14 2 16 4
15 7 12 4
16 17 13 4
17 19 19 4
18 5 17 5
19 12 12 5
20 18 40 5

Group rows and add sum column of unique values

Here an example of my data.frame:
df = read.table(text='colA colB colC
10 11 7
10 34 7
10 89 7
10 21 7
2 23 5
2 21 5
2 56 5
22 14 3
22 19 3
22 90 3
11 19 2
11 45 2
1 45 0
1 23 0
9 8 0
9 11 0
9 21 0', header = TRUE)
I need to group the rows by colA and colC and add a new column which states the sum of unique values based on colB.
In steps here what I need to do for this specific data.frame:
group rows with colA = 10 and 9, colA = 2 and 1, colA = 22 and colA = 11;
find the unique values of colB per each group;
add the unique values in a new col (newcolD).
Note that colC states the total number of observations for colA = 10 and 9, colA = 2 and 1, colA = 22 and colA = 11.
The data.frame needs to remain ordered decreasingly by colC.
My expected output is:
colA colB colC newcolD
10 11 7 5
10 34 7 5
10 89 7 5
10 21 7 5
9 8 0 5
9 11 0 5
9 21 0 5
2 23 5 4
2 21 5 4
2 56 5 4
1 45 0 4
1 23 0 4
22 14 3 3
22 19 3 3
22 90 3 3
11 19 2 2
11 45 2 2
To note that in df the colB duplicated values are: 11 and 21 for group 10 and 9, and 23 for group 2 and 1.
You can do that with dplyr. The trick is to create a new grouping column which groups consecutive values in colA. This is done with cumsum(c(1, diff(colA) < -1) in the example below.
df1 = read.table(text='colA colB colC
10 11 7
10 34 7
10 89 7
10 21 7
2 23 5
2 21 5
2 56 5
22 14 3
22 19 3
22 90 3
1 45 0
1 23 0
9 8 0
9 11 0
9 21 0', header = TRUE,stringsAsFactors=FALSE)
library(dplyr)
df1 %>%
arrange(desc(colA)) %>%
group_by(group_sequential = cumsum(c(1, diff(colA) < -1))) %>%
mutate(newcolD=n_distinct(colB))
colA colB colC group_sequential newcolD
<int> <int> <int> <dbl> <int>
1 22 14 3 1 3
2 22 19 3 1 3
3 22 90 3 1 3
4 10 11 7 2 5
5 10 34 7 2 5
6 10 89 7 2 5
7 10 21 7 2 5
8 9 8 0 2 5
9 9 11 0 2 5
10 9 21 0 2 5
11 2 23 5 3 4
12 2 21 5 3 4
13 2 56 5 3 4
14 1 45 0 3 4
15 1 23 0 3 4
EDIT FOR NEW DATA
With the data you added, we need to create a custom grouping. I use case_when in the example below. This matches the order you show in the desired output column. In the text, you wrote that you wanted the table to be sorted by colC. To do so, change the last line to arrange(desc(colC))
df1 = read.table(text='colA colB colC
10 11 7
10 34 7
10 89 7
10 21 7
2 23 5
2 21 5
2 56 5
22 14 3
22 19 3
22 90 3
11 19 2
11 45 2
1 45 0
1 23 0
9 8 0
9 11 0
9 21 0', header = TRUE,stringsAsFactors=FALSE)
library(dplyr)
df1 %>%
group_by(group_sequential = case_when(.$colA==10|.$colA==9~1,
.$colA==2|.$colA==1~2,
.$colA==22~3,
.$colA==11~4)) %>%
mutate(newcolD=n_distinct(colB)) %>%
arrange(desc(newcolD))
colA colB colC group_sequential newcolD
<int> <int> <int> <dbl> <int>
1 10 11 7 1 5
2 10 34 7 1 5
3 10 89 7 1 5
4 10 21 7 1 5
5 9 8 0 1 5
6 9 11 0 1 5
7 9 21 0 1 5
8 2 23 5 2 4
9 2 21 5 2 4
10 2 56 5 2 4
11 1 45 0 2 4
12 1 23 0 2 4
13 22 14 3 3 3
14 22 19 3 3 3
15 22 90 3 3 3
16 11 19 2 4 2
17 11 45 2 4 2
You're really not making it easy for us, reposting slight variations of the same question instead of updating the old one and presenting conditions that are vague and inconsistent with what the desired output implies. Anyhow, here is my attempt. This is more an answer to the second question you posted, as that was a bit more general in form.
It's a bit messy, it's pretty much a direct translation of your conditions into a for loop with some if statements. I chose to focus on your written conditions rather than the expected output as that was the easier one to understand. If you want a better answer, please consider cleaning up you question(s) considerably.
df1 <- read.table(text="
colA colB colC
10 11 7
10 34 7
10 89 7
10 21 7
2 23 5
2 21 5
2 56 5
22 14 3
22 19 3
22 90 3
11 19 2
11 45 2
1 45 0
1 23 0
9 8 0
9 11 0
9 21 0", header=TRUE)
df2 <- read.table(text="
colA colB colC
10 11 7
10 34 7
10 89 7
10 21 7
2 23 5
2 21 5
2 56 5
33 24 3
33 78 3
22 14 3
22 19 3
22 90 3
11 19 2
11 45 2
1 45 0
1 23 0
9 8 0
9 11 0
9 21 0
32 11 0", header=TRUE)
df <- df1
for (i in 1:nrow(df)) {
df$colD[i] <- ifelse(df$colC[i] == 0,
0,
length(unique(df$colA[1:i])))
if (any(df$colA[i]-1 == df$colA[1:i]) & df$colC[i] != 0) {
df$colD[i] <- df$colD[which(df$colA[i]-1 == df$colA[1:i])][1]
}
}
# colA colB colC colD
# 10 11 7 1
# 10 34 7 1
# 10 89 7 1
# 10 21 7 1
# 2 23 5 2
# 2 21 5 2
# 2 56 5 2
# 22 14 3 3
# 22 19 3 3
# 22 90 3 3
# 11 19 2 1
# 11 45 2 1
# 1 45 0 0
# 1 23 0 0
# 9 8 0 0
# 9 11 0 0
# 9 21 0 0
df <- df2
for (i in 1:nrow(df)) {
df$colD[i] <- ifelse(df$colC[i] == 0,
0,
length(unique(df$colA[1:i])))
if (any(df$colA[i]-1 == df$colA[1:i]) & df$colC[i] != 0) {
df$colD[i] <- df$colD[which(df$colA[i]-1 == df$colA[1:i])][1]
}
}
df
# colA colB colC colD
# 10 11 7 1
# 10 34 7 1
# 10 89 7 1
# 10 21 7 1
# 2 23 5 2
# 2 21 5 2
# 2 56 5 2
# 33 24 3 3
# 33 78 3 3
# 22 14 3 4
# 22 19 3 4
# 22 90 3 4
# 11 19 2 1
# 11 45 2 1
# 1 45 0 0
# 1 23 0 0
# 9 8 0 0
# 9 11 0 0
# 9 21 0 0
# 32 11 0 0
To also group the rows where colC is zero, it's sufficient to adjust the conditionals like this:
for (i in 1:nrow(df)) {
df$colD[i] <- length(unique(df$colA[1:i]))
if (any(df$colA[i]-1 == df$colA[1:i])) {
df$colD[i] <- df$colD[which(df$colA[i]-1 == df$colA[1:i])][1]
}
}

R - Index position with condition

I've a data frame like this
w<-c(0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0)
i would like an index position starting after value 1.
output : NA,NA,NA,NA,NA,1,2,3,4,5,6,7,1,2,3,4,5,1,2,3,4,5,6,7,8,9
ideally applicable to a data frame.
Thanks
edit : w is a data frame,
roughly this function
m<-as.data.frame(w)
m[m!=1] <- row(m)[m!=1]
m
w
1 1
2 2
3 3
4 4
5 5
6 1
7 7
8 8
9 9
10 10
11 11
12 12
13 1
14 14
15 15
16 16
17 17
18 1
19 19
20 20
21 21
22 22
23 23
24 24
25 25
26 26
but with a return to 1 when value 1 is matching.
> m
w wanted
1 1 NA
2 2 NA
3 3 NA
4 4 NA
5 5 NA
6 1 1
7 7 2
8 8 3
9 9 4
10 10 5
11 11 6
12 12 7
13 1 1
14 14 2
15 15 3
16 16 4
17 17 5
18 1 1
19 19 2
20 20 3
21 21 4
22 22 5
23 23 6
24 24 7
25 25 8
26 26 9
Thanks
This assumes that the data is ordered in the way shown in example.
m$wanted <- with(m, ave(w, cumsum(c(TRUE,diff(w) <0)), FUN=seq_along))
m$wanted
#[1] 1 2 3 4 5 1 2 3 4 5 6 7 1 2 3 4 5 1 2 3 4 5 6 7 8 9
For the given data including repeated 1's and non-sequential input, the following works:
m[9,1] <- 100
m[3,1] <- 55
m[14,1] <- 60
m[14,1] <- 60
m[25,1] <- 1
m[19,1] <- 1
m$result <- 1:nrow(m) - which(m$w == 1)[cumsum(m$w == 1)] + 1
But if the data does not start on 1:
m[1,1] <- 2
Then this works:
firstone <- which(m$w == 1)[1]
subindex <- m[firstone:nrow(m),'w'] == 1
m$result <- c(rep(NA,firstone-1),1:length(subindex) - which(subindex)[cumsum(subindex)] + 1)

Resources