R reshape dataset - r

I am trying to reshape a dataset to wide format using R. The code below builds my data (df); I would like to obtain df2, but I am struggling a bit.
# Example data: five countries with four consecutive values each (20 rows).
country <- rep(c("AT", "BE", "CY", "DE", "EE"), each = 4)
value <- seq(from = 1, to = 20, by = 1)
df <- data.frame(country, value)
df
# country value
# 1 AT 1
# 2 AT 2
# 3 AT 3
# 4 AT 4
# 5 BE 5
# 6 BE 6
# 7 BE 7
# 8 BE 8
# 9 CY 9
# 10 CY 10
# 11 CY 11
# 12 CY 12
# 13 DE 13
# 14 DE 14
# 15 DE 15
# 16 DE 16
# 17 EE 17
# 18 EE 18
# 19 EE 19
# 20 EE 20
#new dataset
# Desired wide layout: one column per country, one row per position.
AT <- seq(from = 1, to = 4, by = 1)
BE <- seq(from = 5, to = 8, by = 1)
# ... the remaining countries follow the same pattern
df2 <- data.frame(AT = AT, BE = BE)
df2
# AT BE
# 1 1 5
# 2 2 6
# 3 3 7
# 4 4 8
Any help?

Using the tidyverse (dplyr and tidyr)
library(dplyr)
library(tidyr)

df %>%
  group_by(country) %>%
  # Number the rows within each country; this index is what lets
  # pivot_wider() line the values up side by side.
  mutate(row = row_number()) %>%
  # Drop the grouping before reshaping: the grouping column is about to
  # become the column headers.
  ungroup() %>%
  pivot_wider(names_from = country, values_from = value)
# A tibble: 4 x 6
row AT BE CY DE EE
<int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 5 9 13 17
2 2 2 6 10 14 18
3 3 3 7 11 15 19
4 4 4 8 12 16 20

We can reshape to 'wide' format with pivot_wider
library(dplyr)
library(tidyr)
df %>%
group_by(country) %>%
mutate(rn = row_number()) %>%
pivot_wider(names_from = country, values_from = value)
# A tibble: 4 x 6
# rn AT BE CY DE EE
# <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1 1 5 9 13 17
#2 2 2 6 10 14 18
#3 3 3 7 11 15 19
#4 4 4 8 12 16 20
Or using base R
out <- unstack(df, value ~ country)
str(out)
#'data.frame': 4 obs. of 5 variables:
# $ AT: num 1 2 3 4
# $ BE: num 5 6 7 8
# $ CY: num 9 10 11 12
# $ DE: num 13 14 15 16
# $ EE: num 17 18 19 20

Related

Get frequency and first occurrence for each item of a column in a data frame

In R, I'm trying to get the frequency and the first occurrence of each item in a column of a data frame.
I have this:
df_input <- data.frame(observation_source = c("AB","CD","EF","GH","IJ","KL","MN"), observation_value = c(15,17,17,17,21,32,32))
observation_source observation_value
1 AB 15
2 CD 17
3 EF 17
4 GH 17
5 IJ 21
6 KL 32
7 MN 32
And I'm trying to get this:
observation_source observation_value value_frequency value_first_row
1 AB 15 1 1
2 CD 17 3 2
3 EF 17 3 2
4 GH 17 3 2
5 IJ 21 1 5
6 KL 32 2 6
7 MN 32 2 6
Such that, on row 4 for example, the value 17 occurs 3 times in total and occurs on row 2 for the first time.
I know how to do this with a for loop, but it gets extremely slow as the number of rows increases (e.g. 100,000). Any idea how else I can do that? Many thanks!!
You could use add_count and match to achieve both goals:
library(dplyr)
df_input %>%
add_count(observation_value, name = "value_frequency") %>%
mutate(value_first_row = observation_value %>% match(., .))
# observation_source observation_value value_frequency value_first_row
# 1 AB 15 1 1
# 2 CD 17 3 2
# 3 EF 17 3 2
# 4 GH 17 3 2
# 5 IJ 21 1 5
# 6 KL 32 2 6
# 7 MN 32 2 6
library(dplyr)
df_input %>%
mutate(
value_first_row = row_number()
) %>%
group_by(observation_value) %>%
mutate(
value_frequency = n(),
value_first_row = min(value_first_row)
)
# # A tibble: 7 × 4
# # Groups: observation_value [4]
# observation_source observation_value value_first_row value_frequency
# <chr> <dbl> <int> <int>
# 1 AB 15 1 1
# 2 CD 17 2 3
# 3 EF 17 2 3
# 4 GH 17 2 3
# 5 IJ 21 5 1
# 6 KL 32 6 2
# 7 MN 32 6 2
You can adjust the column order too, if that's important, with ... %>% relocate(value_first_row, .after = last_col())
Using base R
transform(df_input, value_frequency = ave(observation_value,
observation_value, FUN = length),
value_first_row = ave(seq_along(observation_value),
observation_value, FUN = min))
-output
observation_source observation_value value_frequency value_first_row
1 AB 15 1 1
2 CD 17 3 2
3 EF 17 3 2
4 GH 17 3 2
5 IJ 21 1 5
6 KL 32 2 6
7 MN 32 2 6

Sum column over specific rownumbers in grouped dataframe in R

I have a dataframe like this:
library(dplyr)

# 100 rows: x runs from 1 to 100 and y labels consecutive blocks of ten rows.
# `each = 10` alone produces the 100 labels. Combining it with `times = 10`
# yields 1000 labels, and data.frame() then silently recycles x to 1000 rows.
df <- data.frame(
  x = 1:100,
  y = rep(1:10, each = 10)
) %>%
  group_by(y)
And I would like to compute the sum of x from the 3rd to the 6th row of each group of y.
I think this should be easy, but I just can not figure it out at the moment.
In pseudocode I imagine something like this:
df %>%
mutate(
sum(x, ifelse(between(row_number(), 3,6)))
)
But this of course does not work. I would like to solve it with some dplyr-function, but also in base R I cannot think of a fast solution.
For the first group the sum would be 3+4+5+6....
One option could be:
df %>%
group_by(y) %>%
mutate(z = sum(x[row_number() %in% 3:6]))
x y z
<int> <int> <int>
1 1 1 18
2 2 1 18
3 3 1 18
4 4 1 18
5 5 1 18
6 6 1 18
7 7 1 18
8 8 1 18
9 9 1 18
10 10 1 18
You could also do this with filter() and summarise() and obtain a group-wise summary:
df %>%
group_by(y) %>%
mutate(rn = 1:n()) %>%
filter(rn %in% 3:6) %>%
summarise(x_sum = sum(x))
# A tibble: 10 x 2
y x_sum
<int> <int>
1 1 18
2 2 58
3 3 98
4 4 138
5 5 178
6 6 218
7 7 258
8 8 298
9 9 338
10 10 378
Update: If you want to sum multiple sequences from x then you can sum by index:
df %>%
group_by(y) %>%
mutate(sum_row3to6 = sum(x[3:6]),
sum_row1to4 = sum(x[1:4])
)
Output:
x y sum_row3to6 sum_row1to4
<int> <int> <int> <int>
1 1 1 18 10
2 2 1 18 10
3 3 1 18 10
4 4 1 18 10
5 5 1 18 10
6 6 1 18 10
7 7 1 18 10
8 8 1 18 10
9 9 1 18 10
10 10 1 18 10
First answer:
We could use slice and summarise
library(dplyr)
df %>%
group_by(y) %>%
slice(3:6) %>%
summarise(sum = sum(x))
Output:
y sum
<int> <int>
1 1 18
2 2 58
3 3 98
4 4 138
5 5 178
6 6 218
7 7 258
8 8 298
9 9 338
10 10 378
data.table
library(data.table)

# 100 rows: `each = 10` alone gives ten labels of ten rows each. Adding
# `times = 10` would produce 1000 labels and recycle x.
df <- data.frame(
  x = 1:100,
  y = rep(1:10, each = 10)
)
# rowid(y) numbers the rows within each value of y. Keep rows 3 to 6 of every
# group and sum x per group; the trailing [] makes the result print.
setDT(df)[rowid(y) %in% 3:6, list(sum_x = sum(x)), by = y][]
#> y sum_x
#> 1: 1 18
#> 2: 2 58
#> 3: 3 98
#> 4: 4 138
#> 5: 5 178
#> 6: 6 218
#> 7: 7 258
#> 8: 8 298
#> 9: 9 338
#> 10: 10 378
Created on 2021-05-21 by the reprex package (v2.0.0)

How to get the lowest and highest values for interval class

My data looks like the following data:
df<-read.table(text = "temp
12
15
12
6
9
11
15
14
14
16
14
14
11
12
13
14
10
12
12
14
9
13
12
15
11
11
12
12
10
11",header=TRUE)
I want to find the lowest and highest values of temp and compute the cumulative proportion for every value in that range.
I have tried the following code:
library(purrr)
library(dplyr)
# For every column of df: count its distinct values, then add a cumulative
# share column computed from temp.
# The call to count() must be closed before piping into mutate(); without
# that parenthesis the expression is incomplete and does not parse.
map(names(df), ~ df %>%
  count(!!rlang::sym(.x)) %>%
  mutate(cum = cumsum(temp) / sum(temp)))
As you can see, this gives us the temps 6, 9, 10, 11, 12, 13, 14, 15 and 16, but 7 and 8 are missing.
I want to have the following output:
temp n cum
6 x x
7 0 x
8 0 x
9 x x
10 x x
11 x x
12 x x
13 x x
14 x x
15 x x
16 x x
We can use complete to fill the missing sequence in temp and fill the cum value.
library(dplyr)
library(tidyr)
df %>%
count(temp) %>%
mutate(cum=cumsum(n)/sum(n)) %>%
complete(temp = seq(min(temp), max(temp)), fill = list(n = 0)) %>%
fill(cum)
# A tibble: 11 x 3
# temp n cum
# <int> <dbl> <dbl>
# 1 6 1 0.0333
# 2 7 0 0.0333
# 3 8 0 0.0333
# 4 9 2 0.1
# 5 10 2 0.167
# 6 11 5 0.333
# 7 12 8 0.6
# 8 13 2 0.667
# 9 14 6 0.867
#10 15 3 0.967
#11 16 1 1
In base R you could use table to get df2, match the frequencies within a new data.frame built from the temperature range, where you set NA to zero, and calculate the cumsum.
df2 <- data.frame(table(df$temp))
rg <- range(df$temp)
res <- within(data.frame(temp=rg[1]:rg[2]), {
n <- df2[match(temp, df2$Var1), "Freq"]
n[is.na(n)] <- 0
cum=cumsum(n/sum(n))
})[c(1, 3, 2)]
res
# temp n cum
# 1 6 1 0.03333333
# 2 7 0 0.03333333
# 3 8 0 0.03333333
# 4 9 2 0.10000000
# 5 10 2 0.16666667
# 6 11 5 0.33333333
# 7 12 8 0.60000000
# 8 13 2 0.66666667
# 9 14 6 0.86666667
# 10 15 3 0.96666667
# 11 16 1 1.00000000

R how to fill in NA with rules

data=data.frame(person=c(1,1,1,2,2,2,2,3,3,3,3),
t=c(3,NA,9,4,7,NA,13,3,NA,NA,12),
WANT=c(3,6,9,4,7,10,13,3,6,9,12))
So basically I am wanting to create a new variable 'WANT' which takes the PREVIOUS value in t and ADDS 3 to it, and if there are many NA in a row then it keeps doing this. My attempt is:
library(dplyr)
data %>%
group_by(person) %>%
mutate(WANT_TRY = fill(t) + 3)
Here's one way -
data %>%
group_by(person) %>%
mutate(
# cs = cumsum(!is.na(t)), # creates index for reference value; uncomment if interested
w = case_when(
# rle() gives the running length of NA
is.na(t) ~ t[cumsum(!is.na(t))] + 3*sequence(rle(is.na(t))$lengths),
TRUE ~ t
)
) %>%
ungroup()
# A tibble: 11 x 4
person t WANT w
<dbl> <dbl> <dbl> <dbl>
1 1 3 3 3
2 1 NA 6 6
3 1 9 9 9
4 2 4 4 4
5 2 7 7 7
6 2 NA 10 10
7 2 13 13 13
8 3 3 3 3
9 3 NA 6 6
10 3 NA 9 9
11 3 12 12 12
Here is another way. We can do linear interpolation with the imputeTS package.
library(dplyr)
library(imputeTS)
# Interpolate the gaps in the observed column `t`, separately per person.
# The input must be `t`: `WANT` is the already-complete expected answer, so
# interpolating it would just copy it. na_interpolation() is the current name
# of the function formerly spelled na.interpolation().
data2 <- data %>%
  group_by(person) %>%
  mutate(WANT2 = na_interpolation(t)) %>%
  ungroup()
data2
# # A tibble: 11 x 4
# person t WANT WANT2
# <dbl> <dbl> <dbl> <dbl>
# 1 1 3 3 3
# 2 1 NA 6 6
# 3 1 9 9 9
# 4 2 4 4 4
# 5 2 7 7 7
# 6 2 NA 10 10
# 7 2 13 13 13
# 8 3 3 3 3
# 9 3 NA 6 6
# 10 3 NA 9 9
# 11 3 12 12 12
This is harder than it seems because of the double NA at the end. If it weren't for that, then the following:
ifelse(is.na(data$t), c(0, data$t[-nrow(data)])+3, data$t)
...would give you what you want. The simplest way, that uses the same logic but doesn't look very clever (sorry!) would be:
# One imputation pass over a numeric vector.
#
# Every NA in `x` is replaced by the element immediately before it plus
# `step`; non-NA elements are returned unchanged. The element "before" the
# first position is taken to be 0, so a leading NA becomes `step`.
# An NA whose predecessor is also NA stays NA after a single pass, so a run of
# k consecutive NAs needs k passes to be filled completely.
#
# x:    numeric vector, possibly containing NA.
# step: increment added to the preceding value (default 3).
# Returns a vector of the same length as `x`.
.impute <- function(x, step = 3) {
  previous <- c(0, x[-length(x)])
  ifelse(is.na(x), previous + step, x)
}
.impute(.impute(data$t))
...which just cheats by doing it twice. Does that help?
You can use functional programming from purrr and "NA-safe" addition from hablar:
library(hablar)
library(dplyr)
library(purrr)
data %>%
  group_by(person) %>%
  # In the accumulate() lambda, .x is the value carried so far and .y is the
  # next element of t. Keep an observed .y as it is and fall back to
  # "previous value plus 3" only where t is NA. A lambda that ignores .y would
  # overwrite every observed value with t[1] + 3 * k.
  mutate(WANT2 = accumulate(t, ~ coalesce(.y, .x %plus_% 3)))
Result
# A tibble: 11 x 4
# Groups: person [3]
person t WANT WANT2
<dbl> <dbl> <dbl> <dbl>
1 1 3 3 3
2 1 NA 6 6
3 1 9 9 9
4 2 4 4 4
5 2 7 7 7
6 2 NA 10 10
7 2 13 13 13
8 3 3 3 3
9 3 NA 6 6
10 3 NA 9 9
11 3 12 12 12

Collapsing a data.frame by group and interval coordinates

I have a data.frame which specifies linear intervals (along chromosomes), where each interval is assigned to a group:
# Example intervals: 14 consecutive 10-unit intervals on three chromosomes,
# each assigned to a group. `FALSE` is spelled out because `F` is an ordinary
# variable that can be reassigned.
df <- data.frame(
  chr = c(rep("1", 5), rep("2", 4), rep("3", 5)),
  start = c(seq(1, 50, 10), seq(1, 40, 10), seq(1, 50, 10)),
  end = c(seq(10, 50, 10), seq(10, 40, 10), seq(10, 50, 10)),
  group = c(
    c("g1.1", "g1.1", "g1.2", "g1.3", "g1.1"),
    c("g2.1", "g2.2", "g2.3", "g2.2"),
    c("g3.1", "g3.2", "g3.2", "g3.2", "g3.3")
  ),
  stringsAsFactors = FALSE
)
I'm looking for a fast way to collapse df by chr and by group such that consecutive intervals along a chr that are assigned to the same group are combined and their start and end coordinates are modified accordingly.
Here's the desired outcome for this example:
# Expected result: consecutive intervals of the same group on the same
# chromosome merged into one row. `FALSE` is spelled out because `F` is an
# ordinary variable that can be reassigned.
res.df <- data.frame(
  chr = c(rep("1", 4), rep("2", 4), rep("3", 3)),
  start = c(c(1, 21, 31, 41), c(1, 11, 21, 31), c(1, 11, 41)),
  end = c(c(20, 30, 40, 50), c(10, 20, 30, 40), c(10, 40, 50)),
  group = c(
    "g1.1", "g1.2", "g1.3", "g1.1",
    "g2.1", "g2.2", "g2.3", "g2.2",
    "g3.1", "g3.2", "g3.3"
  ),
  stringsAsFactors = FALSE
)
Edit: To account for the consecutive requirement you can use the same approach as earlier but add an extra grouping variable based on consecutive values.
library(dplyr)
df %>%
group_by(chr, group, temp.grp = with(rle(group), rep(seq_along(lengths), lengths))) %>%
summarise(start = min(start),
end = max(end)) %>%
arrange(chr, start) %>%
select(chr, start, end, group)
# A tibble: 11 x 4
# Groups: chr, group [9]
chr start end group
<chr> <dbl> <dbl> <chr>
1 1 1 20 g1.1
2 1 21 30 g1.2
3 1 31 40 g1.3
4 1 41 50 g1.1
5 2 1 10 g2.1
6 2 11 20 g2.2
7 2 21 30 g2.3
8 2 31 40 g2.2
9 3 1 10 g3.1
10 3 11 40 g3.2
11 3 41 50 g3.3
A different tidyverse approach could be:
df %>%
gather(var, val, -c(chr, group)) %>%
group_by(chr, group) %>%
filter(val == min(val) | val == max(val)) %>%
spread(var, val)
chr group end start
<chr> <chr> <dbl> <dbl>
1 1 g1.1 20 1
2 1 g1.2 30 21
3 1 g1.3 50 31
4 2 g2.1 10 1
5 2 g2.2 20 11
6 2 g2.3 40 21
7 3 g3.1 10 1
8 3 g3.2 40 11
9 3 g3.3 50 41
Or:
df %>%
  group_by(chr, group) %>%
  # Smallest start and largest end per (chr, group). Computing just these two
  # columns replaces summarise_all(funs(min, max)) followed by dropping the
  # two unwanted columns; funs() is deprecated in dplyr.
  summarise(start_min = min(start), end_max = max(end))
chr group start_min end_max
<chr> <chr> <dbl> <dbl>
1 1 g1.1 1 20
2 1 g1.2 21 30
3 1 g1.3 31 50
4 2 g2.1 1 10
5 2 g2.2 11 20
6 2 g2.3 21 40
7 3 g3.1 1 10
8 3 g3.2 11 40
9 3 g3.3 41 50
A solution, using also rleid() from data.table, to the updated post could be:
df %>%
  # group2 is the run id from data.table's rleid(): it changes whenever
  # `group` changes from one row to the next, so non-adjacent stretches of the
  # same group stay separate.
  group_by(chr, group, group2 = rleid(group)) %>%
  # Smallest start and largest end per run; replaces the deprecated
  # summarise_all(funs(min, max)) plus column removal.
  summarise(start_min = min(start), end_max = max(end))
chr group group2 start_min end_max
<chr> <chr> <int> <dbl> <dbl>
1 1 g1.1 1 1 20
2 1 g1.1 4 41 50
3 1 g1.2 2 21 30
4 1 g1.3 3 31 40
5 2 g2.1 5 1 10
6 2 g2.2 6 11 20
7 2 g2.2 8 31 40
8 2 g2.3 7 21 30
9 3 g3.1 9 1 10
10 3 g3.2 10 11 40
11 3 g3.3 11 41 50

Resources