R copy row and divide value with duplicate ids - r

I have a dataframe like so:
id val
a 10
a 50
b 30
Now for every id, I want to divide val by the number of repetitions of id and copy the row just as many times. So the final dataframe will become like so:
id val
a 5
a 5
a 25
a 25
b 30
Please note that the duplicate ids may not be consecutive.
How can I achieve this?

One dplyr option could be:
df %>%
group_by(id) %>%
mutate(val = val/n()) %>%
uncount(n())
id val
<chr> <dbl>
1 a 5
2 a 5
3 a 25
4 a 25
5 b 30

Store the counts in a vector and use it to repeat the data.frame:
df = data.frame(id=c("a","a","b"),val=c(10,50,30))
df$id = as.character(df$id)
n = table(df$id)
with(df,data.frame(id=rep(id,n[id]),val=rep(val/n[id],n[id])))
id val
1 a 5
2 a 5
3 a 25
4 a 25
5 b 30

Using tapply and stack.
stack(with(d, tapply(val, id, function(x) rep(x/length(x), each=length(x)))))
# values ind
# 1 5 a
# 2 5 a
# 3 25 a
# 4 25 a
# 5 30 b
Data:
d <- structure(list(id = c("a", "a", "b"), val = c(10L, 50L, 30L)), row.names = c(NA,
-3L), class = "data.frame")

Related

how to separate 2 numbers within a column in R

In R, I want to separate numbers that are in the same column. My data appear like this:
id time
1 1,2
2 3,4
3 4,5,6
I want it to appear like this:
1 1
1 2
2 3
2 4
3 4
3 5
3 6
Though not shown, there are different iterations of time that vary depending on the id. For example:
4 1,6,7
5 1,3,6
6 1,4,5
7 1,3,5
8 2,3,4
There are 100 ids and the time column has different #s that vary in order as shown above.
Does anyone have advice to do this?
An option with separate_rows
library(dplyr)
library(tidyr)
df %>%
separate_rows(time, sep = "(?<=.)(?=.)", convert = TRUE)
# A tibble: 4 x 2
# id time
# <dbl> <int>
#1 1 1
#2 1 2
#3 2 3
#4 2 4
data
df <- structure(list(id = c(1, 2), time = c(12, 34)), class = "data.frame",
row.names = c(NA,
-2L))
Using tidyverse you could try the following. Make sure time is character type, and use strsplit to split up into single characters.
library(tidyverse)
df %>%
mutate(time = strsplit(as.character(time), ",")) %>%
unnest(cols = time)
Or you can just use separate_rows and indicate comma as separator:
df %>%
separate_rows(time, sep = ',')
Or in base R you could try this:
s <- strsplit(df$time, ',', fixed = T)
data.frame(id = unlist(s), time = rep(df$id, lengths(s)))
Output
# A tibble: 10 x 2
id time
<int> <chr>
1 1 1
2 1 2
3 2 3
4 2 4
5 3 4
6 3 5
7 3 6
8 4 1
9 4 6
10 4 7
Data
df <- structure(list(id = 1:4, time = c("1,2", "3,4", "4,5,6", "1,6,7"
)), class = "data.frame", row.names = c(NA, -4L))

Find time-lag between groups in a data.frame

Let's suppose I want to estimate the time lag between two groups within a data.frame.
Here an example of my data:
df_1 = data.frame(time = c(1,3,5,6,8,11,15,16,18,20), group = 'a') # create group 'a' data
df_2 = data.frame(time = c(2,7,10,13,19,25), group = 'b') # create group 'b' data
df = rbind(df_1, df_2) # merge groups
df = df[with(df, order(time)), ] # order by time
rownames(df) = NULL #remove row names
> df
time group
1 1 a
2 2 b
3 3 a
4 5 a
5 6 a
6 7 b
7 8 a
8 10 b
9 11 a
10 13 b
11 15 a
12 16 a
13 18 a
14 19 b
15 20 a
16 25 b
Now I need to subtract the time observation from group b to the time observation from group a.
i.e. 2-1, 7-6, 10-8, 13-11, 19-18 and 25-20.
# Expected output
> out
[1] 1 1 2 2 1 5
How can I achieve this?
We can find indices of b and subtract the time value from it's previous index.
inds <- which(df$group == "b")
df$time[inds] - df$time[inds - 1]
#[1] 1 1 2 2 1 5
Here's a tidyverse solution. First add a column by basic logic of the appearance of group b with transmute and a subtraction of the preceding column. Then filter to just the results, and convert to vector with deframe
library(tidyverse)
df %>%
transmute(result = if_else(group == "b", time - lag(time), 0)) %>%
filter(result != 0) %>%
deframe()
result:
[1] 1 1 2 2 1 5

R insert week number from vector and perform na.locf afterwards

For a dataframe similar to below (but much larger obviously)) I want to add missing week numbers from a vector ( vector is named weeks below). In the end, each value for var1 should have 4 rows consisting of week 40 - 42 so the value inserted for week can be different for different values of var1. Initially the inserted rows can have value NA but as a second step I would like to perform na.locf for each value of var1. does anyone know how to do this?
Data frame example:
dat <- data.frame(var1 = rep(c('a','b','c','d'),3),
week = c(rep(40,4),rep(41,4),rep(42,4)),
value = c(2,3,3,2,4,5,5,6,8,9,10,10))
dat <- dat[-c(6,11), ]
weeks <- c(40:42)
Like this?
dat %>%
tidyr::complete(var1,week) %>%
group_by(var1) %>%
arrange(week) %>%
tidyr::fill(value)
# A tibble: 12 x 3
# Groups: var1 [4]
var1 week value
<fct> <dbl> <dbl>
1 a 40 2
2 a 41 4
3 a 42 8
4 b 40 3
5 b 41 3
6 b 42 9
7 c 40 3
8 c 41 5
9 c 42 5
10 d 40 2
11 d 41 6
12 d 42 10
Hi have you considered tidyr::complete and dplyr::fill().
library(dplyr)
library(tidyr)
complete(dat, week = 40:42, var1 = c("a", "b", "c", "d")) %>% fill(value, .direction =
"down")

Subset dataframe based on Values in second dataframe

I have one dataframe, df, that has two columns as such:
> head(df1[,c(10,11)])
ColA ColB
1 12 20
2 7 5
3 32 38
4 37 46
5 15 15
6 4 4
I have a second dataframe, also with 2 columns with matching names. Instead, there are only two numbers, as such:
> head(df2)
ColA ColB
1 50 30
I want to subset values from df1 based on the value in the corresponding column from df2 . Doing this manually would look like this:
colA_vector <- df1[df1$colA < 50,]
colB_vector <- df1[df1$ColB < 30,]
How can I do so in a more general purpose way? I do not want to hardcode anything. The column name "ColA" or "ColB" could be anything (so solutions requiring those column names won't really work).
Thank you.
In base R we could do:
nms <- intersect(names(df1), names(df2))
df1[do.call(`&`, Map(`<`, df1[nms], df2[nms])),]
# ColA ColB
# 1 12 20
# 2 7 5
# 5 15 15
# 6 4 4
Or just df1[do.call('&', Map('<', df1, df2)),] if both data.frames have the same order of columns and same names.
Using the package fuzzyjoin might be more readable however:
library(fuzzy_join)
fuzzy_semi_join(df1, df2, match_fun = `<`)
# ColA ColB
# 1 12 20
# 2 7 5
# 5 15 15
# 6 4 4
data
df1 <- read.table(text="
ColA ColB
1 12 20
2 7 5
3 32 38
4 37 46
5 15 15
6 4 4",h=T,strin=F)
df2 <- read.table(text="ColA ColB
1 50 30",h=T,strin=F)
Create a function if we want to do the same task repeatedly
f1 <- function(dat1, dat2, colName) {
dat1[dat1[[colName]] < dat2[[colName]],]
}
f1(df1, df2, "ColA")
# ColA ColB
#1 12 20
#2 7 5
#3 32 38
#4 37 46
#5 15 15
#6 4 4
f1(df1, df2, "ColB")
# ColA ColB
#1 12 20
#2 7 5
#5 15 15
#6 4 4
data
df1 <- structure(list(ColA = c(12L, 7L, 32L, 37L, 15L, 4L), ColB = c(20L,
5L, 38L, 46L, 15L, 4L)), class = "data.frame", row.names = c(NA,
-6L))
df2 <- structure(list(ColA = 50L, ColB = 30L),
class = "data.frame", row.names = "1")
Using dplyr:
df1 %>%
filter(df1[,1] < df2[,1])
ColA ColB
1 12 20
2 7 5
3 32 38
4 37 46
5 15 15
6 4 4
df1 %>%
filter(df1[,2] < df2[,2])
ColA ColB
1 12 20
2 7 5
3 15 15
4 4 4
Subsetting based on both columns simultaneously:
df1 %>%
filter(df1[,1] < df2[,1] & df1[,2] < df2[,2])
ColA ColB
1 12 20
2 7 5
3 15 15
4 4 4
If you don't want to use the fuzzyjoin join package or make your own function, you can just repeat the second dataframe.
df1 <- data.frame("ColA" = c(12, 7, 32),
"ColB" = c(20, 5, 38))
df2 <- data.frame("ColA" = 50,
"ColB" = 30)
n <- nrow(df1)
df2_new <- do.call("rbind", replicate(n, df2, simplify = FALSE))
df1_which <- as.data.frame(df1 < df2_new)
colA_vector <- df1[df1_which$ColA, "ColA"]
colB_vector <- df1[df1_which$ColB, "ColB"]
You can try a tidyverse funtion. Result is a list of the filtered data.frames.
foo <- function(x, y, ColA, ColB){
require(tidyverse)
var1 <- quo_name(ColA)
var2 <- quo_name(ColB)
x %>%
select(a=!!var1, b=!!var2) %>%
mutate(colA_vector= a < y[[ColA]]) %>%
mutate(colB_vector= b < y[[ColB]]) %>%
gather(k, v, -a, -b) %>%
filter(v) %>%
split(.$k) %>%
map(~select(.,-v,-k))
}
foo(df1, df2, "ColA", "ColB")
$colA_vector
a b
1 12 20
2 7 5
3 32 38
4 37 46
5 15 15
6 4 4
$colB_vector
a b
7 12 20
8 7 5
9 15 15
10 4 4

Merge linear intervals

I have a data.frame which consists of linear intervals for each id:
df <- data.frame(id = c(rep("a",3),rep("b",4),rep("d",4)),
start = c(3,4,10,5,6,9,12,8,12,15,27),
end = c(7,8,12,8,9,13,13,10,15,26,30))
I'm looking for an efficient function that will unite all intersecting intervals per each id. For df the result ill be:
res.df <- data.frame(id = c("a","a","b","d","d","d"),
start = c(3,10,5,8,12,27),
end = c(8,12,13,10,26,30))
For which eventually I'll be able to sum up all the united intervals per each id to get their combined length:
sapply(unique(res.df$id), function(x) sum(res.df$end[which(res.df$id == x)]-res.df$start[which(res.df$id == x)]+1))
#source("https://bioconductor.org/biocLite.R")
#biocLite("IRanges")
library(IRanges)
df1 <- as(df, "RangedData")
as.data.frame(reduce(df1, by = "id", min.gapwidth = 0.5))
# space start end width id
#1 1 3 8 6 a
#2 1 10 12 3 a
#3 1 5 13 9 b
#4 1 8 10 3 d
#5 1 12 26 15 d
#6 1 27 30 4 d

Resources