Add character vector based on reference data frame parameters

I have a data frame df1:
chr = c(1, 1, 1, 1, 2, 2, 2, 2)
point = c(257, 752, 135, 1650, 252, 756, 1230, 1710)
df1 = data.frame(chr, point)
chr point
1 1 257
2 1 752
3 1 135
4 1 1650
5 2 252
6 2 756
7 2 1230
8 2 1710
I would like to add a new column called name to df1. The name to be allocated comes from a reference data frame df2:
chrB = c(1, 1, 1, 1, 2, 2, 2, 2)
txstart = c(0, 501, 1001, 1501, 0, 501, 1001, 1501)
txstop = c(500, 1000, 1500, 2000, 500, 1000, 1500, 2000)
name2 = c("F", "W", "Q", "G", "V", "S", "L", "Y")
chrB txstart txstop name2
1 2 0 500 F
2 2 501 1000 W
3 2 1001 1500 Q
4 2 1501 2000 G
5 1 0 500 V
6 1 501 1000 S
7 1 1001 1500 L
8 1 1501 2000 Y
Where chr in df1 matches chrB in df2 AND point in df1 lies between txstart and txstop, the corresponding name2 from df2 should be added to df1. The result I would like is below:
chr point name
1 1 257 V
2 1 752 S
3 1 135 L
4 1 1650 Y
5 2 252 F
6 2 756 W
7 2 1230 Q
8 2 1710 G
Any help much appreciated!!!

With the updated dataset, only the foverlaps method works:
library(data.table)
dt1 <- data.table(chr, mp1 = point, mp2 = point,
                  key = c("chr", "mp1", "mp2"))
dt2 <- data.table(chrB, txstart, txstop, name2,
                  key = c("chrB", "txstart", "txstop"))
foverlaps(dt1, dt2, type = "within")[, .(chr, midpoint = mp1, name = name2)][]
which gives:
chr midpoint name
1: 1 135 F
2: 1 257 F
3: 1 752 W
4: 1 1650 G
5: 2 252 V
6: 2 756 S
7: 2 1230 L
8: 2 1710 Y
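For completeness, the same interval lookup can be written as a non-equi join in recent dplyr. A minimal sketch, assuming dplyr >= 1.1.0 (for join_by) and building the reference frame df2 from the vectors above:
library(dplyr)
df2 <- data.frame(chrB, txstart, txstop, name2)
df1 %>%
  left_join(df2, by = join_by(chr == chrB, between(point, txstart, txstop))) %>%
  select(chr, point, name = name2)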
Old answer:
When you want to check whether the midpoint lies between the start and stop points of df2, you could use:
df1$name <- df2$name2[match(df1$chr, df2$chrB) &
                        df1$midpoint > df2$txstart &
                        df1$midpoint < df2$txstop]
Note that this relied on the original question's data, where df1 had a midpoint column and the rows of the two frames lined up one-to-one.
which gives:
> df1
chr midpoint name
1 1 250 F
2 1 750 W
3 1 1250 Q
4 1 1750 G
5 2 250 V
6 2 750 S
7 2 1250 L
8 2 1750 Y
As an alternative approach, you could use the foverlaps function from the data.table package:
library(data.table)
dt1 <- data.table(chr, mp1 = midpoint, mp2 = midpoint, key = c("chr","mp1", "mp2"))
dt2 <- data.table(chrB, txstart, txstop, name2, key = c("chrB","txstart", "txstop"))
foverlaps(dt1, dt2, type="within", nomatch=0L)[, .(chr, midpoint=mp1, name=name2)][]
which gives the same result:
chr midpoint name
1: 1 250 F
2: 1 750 W
3: 1 1250 Q
4: 1 1750 G
5: 2 250 V
6: 2 750 S
7: 2 1250 L
8: 2 1750 Y

Related

Complete missing column values with values from another table [duplicate]

I have a data frame (datadf) with 3 columns: 'x', 'y', and 'z'. Several 'x' values are missing (NA). 'y' and 'z' are non-measured variables.
x y z
153 a 1
163 b 1
NA d 1
123 a 2
145 e 2
NA c 2
NA b 1
199 a 2
I have another data frame (imputeddf) with the same three columns:
x y z
123 a 1
145 a 2
124 b 1
168 b 2
123 c 1
176 c 2
184 d 1
101 d 2
I wish to replace NA in 'x' in 'datadf' with values from 'imputeddf' where 'y' and 'z' matches between the two data sets (each combo of 'y' and 'z' has its own value of 'x' to fill in).
The desired result:
x y z
153 a 1
163 b 1
184 d 1
123 a 2
145 e 2
176 c 2
124 b 1
199 a 2
I am trying things like:
finaldf <- datadf
finaldf$x <- if(datadf[!is.na(datadf$x)]){ddply(datadf, x=imputeddf$x[datadf$y == imputeddf$y & datadf$z == imputeddf$z])}else{datadf$x}
but it's not working.
What is the best way for me to fill in the NAs in datadf using my imputed value df?
I would do this:
library(data.table)
setDT(DF1); setDT(DF2)
DF1[DF2, x := ifelse(is.na(x), i.x, x), on=c("y","z")]
which gives
x y z
1: 153 a 1
2: 163 b 1
3: 184 d 1
4: 123 a 2
5: 145 e 2
6: 176 c 2
7: 124 b 1
8: 199 a 2
Comments: this approach isn't so great, since it merges the whole of DF1, while we only need to merge the subset where is.na(x). Here, the improvement looks like this (thanks, @Arun):
DF1[is.na(x), x := DF2[.SD, x, on=c("y", "z")]]
This way is analogous to @RHertel's answer.
From @Jakob's comment:
does this work for more than one x variable? If I want to fill up entire datasets with several columns?
You can enumerate the desired columns:
DF1[DF2, `:=`(
  x = ifelse(is.na(x), i.x, x),
  w = ifelse(is.na(w), i.w, w)
), on=c("y","z")]
The expression could be constructed using lapply and substitute, probably, but if the set of columns is fixed, it might be cleanest just to write it out as above.
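If you'd rather avoid building expressions altogether, a plain loop over data.table's set() patches each column in turn. A hedged sketch, assuming each (y, z) pair occurs at most once in DF2; "w" is a hypothetical second column to fill:
cols <- c("x", "w")
# one join to find, for each row of DF1, its matching row in DF2 (NA if none)
idx <- DF2[DF1, on = c("y", "z"), which = TRUE]
for (v in cols) {
  rows <- which(is.na(DF1[[v]]) & !is.na(idx))            # NAs that have a donor row
  set(DF1, i = rows, j = v, value = DF2[[v]][idx[rows]])  # patch in place
}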
Here's an alternative with base R:
df1[is.na(df1$x),"x"] <- merge(df2,df1[is.na(df1$x),][,c("y","z")])$x
> df1
# x y z
#1 153 a 1
#2 163 b 1
#3 124 b 1
#4 123 a 2
#5 145 e 2
#6 176 c 2
#7 184 d 1
#8 199 a 2
A dplyr solution, conceptually identical to the answers above. To pull out just the rows of imputeddf that correspond to NAs in datadf, use semi_join. Then, use another join to match back to datadf. (This step is not very clean, unfortunately.)
library(dplyr)
replacement_rows <- imputeddf %>%
  semi_join(datadf %>% filter(is.na(x)), by = c("y", "z"))
datadf <- datadf %>%
  left_join(replacement_rows, by = c("y", "z")) %>%
  mutate(x = if_else(is.na(x.x), x.y, x.x)) %>%
  select(x, y, z)
This gets what you want:
> datadf
# A tibble: 8 x 3
x y z
<dbl> <chr> <dbl>
1 153 a 1
2 163 b 1
3 184 d 1
4 123 a 2
5 145 e 2
6 176 c 2
7 124 b 1
8 199 a 2
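The final step can be trimmed a little with coalesce, which returns the first non-NA of its arguments; same join as above, just a cleaner mutate:
datadf %>%
  left_join(replacement_rows, by = c("y", "z")) %>%
  mutate(x = coalesce(x.x, x.y)) %>%
  select(x, y, z)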
In dplyr, you can use rows_patch to update NAs:
rows_patch(datadf, imputeddf, by = c("y", "z"), unmatched = "ignore")
# x y z
# 1 153 a 1
# 2 163 b 1
# 3 184 d 1
# 4 123 a 2
# 5 145 e 2
# 6 176 c 2
# 7 124 b 1
# 8 199 a 2
data:
datadf <- read.table(header = T, text = "x y z
153 a 1
163 b 1
NA d 1
123 a 2
145 e 2
NA c 2
NA b 1
199 a 2")
imputeddf <- read.table(header = T, text = " x y z
123 a 1
145 a 2
124 b 1
168 b 2
123 c 1
176 c 2
184 d 1
101 d 2")

How to split a dataframe into a list of dataframes based on distinct value ranges

I want to split a dataframe into a list of dataframes based on distinct ranges of a numeric variable.
ILLUSTRATIVE DATA:
set.seed(123)
df <- data.frame(
  subject = LETTERS[1:10],
  weight = sample(1:1000, 10)
)
df
subject weight
1 A 288
2 B 788
3 C 409
4 D 881
5 E 937
6 F 46
7 G 525
8 H 887
9 I 548
10 J 453
I'd like to have a list of 4 smaller dataframes based on these limits of the variable weight:
limits <- c(250, 500, 750, 1000)
That is, what I'm after, in the list of dataframes, is one dataframe where weight is in the range of 0-250, another where weight ranges between 251-500, another where the range is from 501-750, and so on--in other words, the ranges are distinct.
What I've tried so far is this dplyr solution (with 0 added to the limits), which outputs a list of 5 dataframes, but with cumulative ranges:
library(dplyr)
limits <- c(0, 250, 500, 750, 1000)
lapply(limits, function(x) {df %>% filter(weight <= x)})
[[1]]
[1] subject weight
<0 rows> (or 0-length row.names)
[[2]]
subject weight
1 F 46
[[3]]
subject weight
1 A 288
2 C 409
3 F 46
4 J 453
[[4]]
subject weight
1 A 288
2 C 409
3 F 46
4 G 525
5 I 548
6 J 453
[[5]]
subject weight
1 A 288
2 B 788
3 C 409
4 D 881
5 E 937
6 F 46
7 G 525
8 H 887
9 I 548
10 J 453
How could this code be fixed, or which other code can be used, so that a list of dataframes is obtained based on distinct weight ranges?
Perhaps:
library(dplyr)
df %>%
  group_split(group = findInterval(weight, limits))
Output:
[[1]]
# A tibble: 4 x 3
subject weight group
<fct> <int> <int>
1 C 179 0
2 E 195 0
3 H 118 0
4 J 229 0
[[2]]
# A tibble: 3 x 3
subject weight group
<fct> <int> <int>
1 A 415 1
2 B 463 1
3 I 299 1
[[3]]
# A tibble: 1 x 3
subject weight group
<fct> <int> <int>
1 D 526 2
[[4]]
# A tibble: 2 x 3
subject weight group
<fct> <int> <int>
1 F 938 3
2 G 818 3
Just use keep = FALSE as additional argument to group_split if you want to remove the group column in your output.
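For example (in current dplyr releases the argument is spelled .keep; older versions used plain keep):
df %>%
  group_split(group = findInterval(weight, limits), .keep = FALSE)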
A base R one-liner can split the data by limits.
split(df, findInterval(df$weight, limits))
#$`0`
# subject weight
#3 C 179
#5 E 195
#8 H 118
#10 J 229
#
#$`1`
# subject weight
#1 A 415
#2 B 463
#9 I 299
#
#$`2`
# subject weight
#4 D 526
#
#$`3`
# subject weight
#6 F 938
#7 G 818
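If you want the list names to reflect the actual ranges rather than findInterval's group indices, a small variation splits on cut instead. A sketch, assuming all weights fall in (0, 1000]:
split(df, cut(df$weight, breaks = c(0, limits),
              labels = c("0-250", "251-500", "501-750", "751-1000")))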

Select nth observation and sum by group using data.table

I would like to turn the first table into the second by selecting the last observation of a group for a and b, the first observation for c, summing the observations within each group for d and e, and, for f, checking whether a valid date exists and using that date.
Table 1:
ID a b c d e f
1 10 100 1000 10000 100000 ?
1 10 100 1001 10010 100100 5/07/1977
1 11 111 1002 10020 100200 5/07/1977
2 22 222 2000 20000 200000 6/02/1980
3 33 333 3000 30000 300000 20/12/1978
3 33 333 3001 30010 300100 ?
4 40 400 4000 40000 400000 ?
4 40 400 4001 40010 400100 ?
4 40 400 4002 40020 400200 7/06/1944
4 44 444 4003 40030 400300 ?
4 44 444 4004 40040 400400 ?
4 44 444 4005 40050 400500 ?
5 55 555 5000 50000 500000 31/05/1976
5 55 555 5001 50010 500100 31/05/1976
Table 2:
ID a b c d e f
1 11 111 1000 30030 300300 5/07/1977
2 22 222 2000 20000 200000 6/02/1980
3 33 333 3000 60010 600100 20/12/1978
4 44 444 4000 240150 2401500 7/06/1944
5 55 555 5000 100010 1000100 31/05/1976
I have looked through StackOverflow questions and have only seen elements of this. I can do a through e in the following steps.
library(data.table)
setwd('D:/Work/BRB/StackOverflow')
DT = data.table(fread('datatable.csv', header=TRUE))
AB = DT[ , .SD[.N], ID ]
AB = AB[ , c('a', 'b') ]
C = DT[ , .SD[1], ID ]
C = C[ , 'c' ]
DE = DT[ , .(d = sum(d), e = sum(e)) , by = ID ]
Final = cbind(AB, C, DE)
Final
My question is: can I do the operations on variables a, b, c, d, e in one transformation, without having to split it into three steps?
Also, I have no idea how to do f. Any suggestions?
Finally, I am new to R. Is there anything else I can improve about my code?
There are several things you can improve:
fread will return a data.table, so no need to wrap it in data.table. You can check with class(DT).
Use the na.strings parameter when reading in the data. See below for an example.
Summarise with:
DT[, .(a = a[.N],
       b = b[.N],
       c = c[1],
       d = sum(d),
       e = sum(e),
       f = unique(na.omit(f)))
   , by = ID]
you will then get:
ID a b c d e f
1: 1 11 111 1000 30030 300300 5/07/1977
2: 2 22 222 2000 20000 200000 6/02/1980
3: 3 33 333 3000 60010 600100 20/12/1978
4: 4 44 444 4000 240150 2401500 7/06/1944
5: 5 55 555 5000 100010 1000100 31/05/1976
Some explanations & other notes:
Subsetting with [1] gives you the first value of a group. You could also use the first function, which is optimized in data.table and thus faster.
Subsetting with [.N] gives you the last value of a group. You could also use the last function, which is optimized in data.table and thus faster.
Don't use variable names that are also functions in R (in this case, don't use c as a variable name). See ?c for an explanation of what the c function does.
For summarising the f-variable, I used unique in combination with na.omit. If there is more than one unique date per ID, you could also use, for example, na.omit(f)[1]; see the sketch after these notes.
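A minimal sketch of that fallback, for the case where a group might carry more than one distinct date and you simply want the first non-missing one:
DT[, .(f = na.omit(f)[1]), by = ID]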
If speed is an issue, you could optimize the above to (thanks to @Frank):
DT[order(f)
   , .(a = last(a),
       b = last(b),
       c = first(c),
       d = sum(d),
       e = sum(e),
       f = first(f))
   , by = ID]
Ordering by f puts NA values last. As a result, the internal GForce optimization is now used for all calculations.
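To verify that GForce actually kicks in, you can switch on data.table's verbose output, which reports when j has been GForce-optimized (a quick diagnostic only, not part of the solution):
options(datatable.verbose = TRUE)   # look for a 'GForce optimized j' message
DT[order(f), .(a = last(a), d = sum(d)), by = ID]
options(datatable.verbose = FALSE)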
Used data:
DT <- fread("ID a b c d e f
1 10 100 1000 10000 100000 ?
1 10 100 1001 10010 100100 5/07/1977
1 11 111 1002 10020 100200 5/07/1977
2 22 222 2000 20000 200000 6/02/1980
3 33 333 3000 30000 300000 20/12/1978
3 33 333 3001 30010 300100 ?
4 40 400 4000 40000 400000 ?
4 40 400 4001 40010 400100 ?
4 40 400 4002 40020 400200 7/06/1944
4 44 444 4003 40030 400300 ?
4 44 444 4004 40040 400400 ?
4 44 444 4005 40050 400500 ?
5 55 555 5000 50000 500000 31/05/1976
5 55 555 5001 50010 500100 31/05/1976", na.strings='?')
We can use the tidyverse. After grouping by 'ID', we summarise the columns based on the first or last observation:
library(dplyr)
DT %>%
  group_by(ID) %>%
  summarise(a = last(a),
            b = last(b),
            c = first(c),
            d = sum(d),
            e = sum(e),
            f = f[f != "?"][1])
# A tibble: 5 × 7
# ID a b c d e f
# <int> <int> <int> <int> <int> <int> <chr>
#1 1 11 111 1000 30030 300300 5/07/1977
#2 2 22 222 2000 20000 200000 6/02/1980
#3 3 33 333 3000 60010 600100 20/12/1978
#4 4 44 444 4000 240150 2401500 7/06/1944
#5 5 55 555 5000 100010 1000100 31/05/1976

Use function like cumulative sum by group or by each list element in R

I have the following data:
library(data.table)
col1 = c(rep("a", 4), rep("b", 8), rep("c", 6), rep("d", 2))
col2 = sample(-100:250, 20)
col3 = cumsum(col2)
data = data.table(col1, col2, col3)
which gives this data.table:
col1 col2 col3
1: a 56 56
2: a 90 146
3: a 85 231
4: a 214 445
5: b -39 406
6: b 116 522
7: b 42 564
8: b 131 695
9: b 161 856
10: b 54 910
11: b 15 925
12: b 229 1154
13: c 166 1320
14: c 224 1544
15: c -53 1491
16: c 87 1578
17: c -100 1478
18: c -11 1467
19: d 28 1495
20: d 143 1638
As you can see, it's just grouped by col1. I'd like to make some calculations (like cumsum, countif, etc.) based on the groups in col1.
In the end I would like to have:
col1 colsum countif>0 countif<0
a 445 4 0
b 709 7 1
c 313 3 3
d 171 2 0
@commentators
Guys, please! I did write two solutions. The first was very unsightly (no sense putting it here, but it was based on making a list and looping over each element with the calculation), and the second is this:
a1 = aggregate(col2 ~ col1, sum, data = data)
a2 = aggregate(col2 > 0 ~ col1, sum, data = data)
a3 = aggregate(col2 < 0 ~ col1, sum, data = data)
cbind(a1, countif_1 = a2[2], countif_2 = a3[2])
I'm looking for something nice and clean.
data[, list(colsum = sum(col2),
            `countif>0` = sum(col2 > 0),
            `countif<0` = sum(col2 < 0)), by = col1]
## col1 colsum countif>0 countif<0
## 1: a 445 4 0
## 2: b 709 7 1
## 3: c 313 3 3
## 4: d 171 2 0
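Since the question also mentions cumulative sums, note that the same by= mechanism covers those; for example, a per-group running total as a new column (the column name is arbitrary):
data[, cumsum_by_group := cumsum(col2), by = col1]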
You can use dplyr to achieve something similar
library(dplyr)
set.seed(1)
col1 <- c(rep("a", 4), rep("b", 8), rep("c", 6), rep("d",2))
col2 <- sample(-100:250, 20)
data <- tbl_df(data.frame(col1, col2))
str(data)
## Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 20 obs. of 3 variables:
## $ col1: Factor w/ 4 levels "a","b","c","d": 1 1 1 1 2 2 2 2 2 2 ...
## $ col2: int -7 30 99 216 -31 210 225 127 115 -79 ...
data %>%
  group_by(col1) %>%
  summarise(colsum = sum(col2),
            countifpos = sum(col2 > 0),
            countifneg = sum(col2 < 0))
## Source: local data frame [4 x 4]
## col1 colsum countifpos countifneg
## 1 a 338 3 1
## 2 b 497 4 4
## 3 c 758 6 0
## 4 d 184 2 0
You can use tapply to get summaries by group. For instance, first define the metrics you want to calculate:
metrics = function(x) { c(sum(x), length(x[x < 0]), length(x[x > 0])) }
Then use the metrics function to calculate your metrics by group via tapply:
tapply(data$col2, data$col1, metrics)
$a
[1] 241 -50 291
$b
[1] 526 -86 612
$c
[1] 483 -94 577
$d
[1] -88 -88 0
You can then convert this output into a data frame as requested
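One way to do that conversion, as a sketch reusing the tapply result above (the column names are mine and follow the order of the metrics function):
res <- tapply(data$col2, data$col1, metrics)
out <- data.frame(col1 = names(res), do.call(rbind, res))
names(out)[2:4] <- c("colsum", "countif_neg", "countif_pos")
out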
