Getting rows in a data frame based on multiple ranges in R - r

Let's say I have this table
a b
1 5 12
2 6 17
3 7 28
4 8 12
5 9 17
6 10 28
7 15 12
8 25 14
9 13 29
Also another table with index ranges:
start end
1 2 3
2 5 7
I want to get the rows in the first table based on the index ranges in the second table with a group name to differentiate, something like this:
a b group
2 6 17 1
3 7 28 1
5 9 17 2
6 10 28 2
7 15 12 2
how do I achieve this in R?

We can subset df1 taking the row index value from df2 using Map.
# Walk over the ranges in df2 by position: for each one, pull the matching
# rows out of df1, label them with the position of the range, and finally
# stack all the pieces into a single data frame.
do.call(
  rbind,
  lapply(seq_len(nrow(df2)), function(i) {
    rows <- df1[df2$start[i]:df2$end[i], ]
    transform(rows, group = i)
  })
)
Or in purrr :
# Iterate over the paired start/end values, row-bind the slices of df1 and
# let `.id` record which range each row came from.
purrr::map2_dfr(
  df2$start, df2$end,
  function(from, to) df1[from:to, ],
  .id = "group"
)
# group a b
#1 1 6 17
#2 1 7 28
#3 2 9 17
#4 2 10 28
#5 2 15 12
data
# Example data: df1 holds the values, df2 the start/end row indices to extract.
df1 <- data.frame(
  a = c(5L, 6L, 7L, 8L, 9L, 10L, 15L, 25L, 13L),
  b = c(12L, 17L, 28L, 12L, 17L, 28L, 12L, 14L, 29L)
)
df2 <- data.frame(start = c(2L, 5L), end = c(3L, 7L))

An option using data.table:
# Record each row's position in DT1 so it can be compared against the ranges.
DT1[, rn := .I]
# Number the ranges in DT2; this number becomes the group label.
DT2[, g := .I]
# Non-equi join: for every row of DT1 look up the DT2 range with
# start <= rn <= end and copy its g (x.g refers to DT2's column). Rows of DT1
# that fall in no range get NA and are dropped by the trailing filter.
DT1[, g := DT2[DT1, on=.(start<=rn, end>=rn), x.g]][
!is.na(g)]
output:
a b rn g
1: 6 17 2 1
2: 7 28 3 1
3: 9 17 5 2
4: 10 28 6 2
5: 15 12 7 2
data:
library(data.table)
# Example data built directly as data.tables (integer columns, same values
# as the whitespace-separated text tables).
DT1 <- data.table(
  a = c(5L, 6L, 7L, 8L, 9L, 10L, 15L, 25L, 13L),
  b = c(12L, 17L, 28L, 12L, 17L, 28L, 12L, 14L, 29L)
)
DT2 <- data.table(start = c(2L, 5L), end = c(3L, 7L))

Related

Reshape Wide to Long with 2 time variables

Though there is an abundance of 'wide to long' threads for R, I haven't found an answer that will help me with my issue. Any assistance is greatly appreciated!
Example of my dataframe (in wide format):
CODE NAME M_2010_1 M_2011_1 M_2012_1 M_2010_3 M_2011_3 M_2012_3
1 A 10 11 10 9 10 13
12 B 11 13 15 15 14 11
8 C 9 2 4 2 8 8
Desired dataframe (in long):
CODE NAME YEAR M1 M3
1 A 2010 10 9
1 A 2011 11 10
1 A 2012 10 13
12 B 2010 11 15
12 B 2011 13 14
12 B 2012 15 11
8 C 2010 9 2
8 C 2011 2 8
8 C 2012 4 8
Thanks in advance!
Data
# Wide example data: one M_<year>_<k> column per year and measurement k.
df <- data.frame(
  CODE = c(1L, 12L, 8L),
  NAME = c("A", "B", "C"),
  M_2010_1 = c(10L, 11L, 9L),
  M_2011_1 = c(11L, 13L, 2L),
  M_2012_1 = c(10L, 15L, 4L),
  M_2010_3 = c(9L, 15L, 2L),
  M_2011_3 = c(10L, 14L, 8L),
  M_2012_3 = c(13L, 11L, 8L),
  stringsAsFactors = FALSE
)
Code
library(dplyr)
library(tidyr)
# Split every M_<year>_<k> name into three pieces, glue the outer two back
# together ("M" + k), then spread those labels out again as columns so that
# each CODE/NAME/YEAR combination ends up on one row.
df %>%
  pivot_longer(
    cols = -c(CODE, NAME),
    names_to = c("aux1", "YEAR", "aux2"),
    names_sep = "_"
  ) %>%
  unite(aux, aux1, aux2, sep = "") %>%
  pivot_wider(
    names_from = aux,
    values_from = value
  )
Output
# A tibble: 9 x 5
CODE NAME YEAR M1 M3
<int> <chr> <chr> <int> <int>
1 1 A 2010 10 9
2 1 A 2011 11 10
3 1 A 2012 10 13
4 12 B 2010 11 15
5 12 B 2011 13 14
6 12 B 2012 15 11
7 8 C 2010 9 2
8 8 C 2011 2 8
9 8 C 2012 4 8
A one-liner using reshape, which lets you specify everything in a single call.
# Base-R reshape to long format: columns 3:5 (the *_1 measurements) are
# stacked into M1 and columns 6:8 (the *_3 measurements) into M3, giving one
# row per CODE/NAME/YEAR. The second value column is called M3 to match the
# source columns, and all argument names are spelled out in full instead of
# relying on partial matching.
reshape(dat, idvar = c("CODE", "NAME"), varying = list(3:5, 6:8),
        direction = "long", timevar = "YEAR", times = 2010:2012,
        v.names = c("M1", "M3"))
# CODE NAME YEAR M1 M3
# 1.A.2010 1 A 2010 10 9
# 12.B.2010 12 B 2010 11 15
# 8.C.2010 8 C 2010 9 2
# 1.A.2011 1 A 2011 11 10
# 12.B.2011 12 B 2011 13 14
# 8.C.2011 8 C 2011 2 8
# 1.A.2012 1 A 2012 10 13
# 12.B.2012 12 B 2012 15 11
# 8.C.2012 8 C 2012 4 8
Data:
# Wide example data: one M_<year>_<k> column per year and measurement k.
dat <- data.frame(
  CODE = c(1L, 12L, 8L),
  NAME = c("A", "B", "C"),
  M_2010_1 = c(10L, 11L, 9L),
  M_2011_1 = c(11L, 13L, 2L),
  M_2012_1 = c(10L, 15L, 4L),
  M_2010_3 = c(9L, 15L, 2L),
  M_2011_3 = c(10L, 14L, 8L),
  M_2012_3 = c(13L, 11L, 8L),
  stringsAsFactors = FALSE
)
We could do this in pivot_longer after we rearrange the substring in the column names
library(dplyr)
library(stringr)
library(tidyr)
# Swap the two numeric parts of each measurement name (M_2010_1 -> M1_2010)
# so that the piece before "_" is the output column (".value") and the piece
# after it is the year; pivot_longer can then do the whole reshape in one step.
df1 %>%
  rename_with(
    function(nm) str_replace(nm, "_(\\d+)_(\\d+)", "\\2_\\1"),
    starts_with("M_")
  ) %>%
  pivot_longer(
    cols = starts_with("M"),
    names_sep = "_",
    names_to = c(".value", "year")
  )
-output
# A tibble: 9 × 5
CODE NAME year M1 M3
<int> <chr> <chr> <int> <int>
1 1 A 2010 10 9
2 1 A 2011 11 10
3 1 A 2012 10 13
4 12 B 2010 11 15
5 12 B 2011 13 14
6 12 B 2012 15 11
7 8 C 2010 9 2
8 8 C 2011 2 8
9 8 C 2012 4 8
data
# Wide example data: one M_<year>_<k> column per year and measurement k.
df1 <- data.frame(
  CODE = c(1L, 12L, 8L),
  NAME = c("A", "B", "C"),
  M_2010_1 = c(10L, 11L, 9L),
  M_2011_1 = c(11L, 13L, 2L),
  M_2012_1 = c(10L, 15L, 4L),
  M_2010_3 = c(9L, 15L, 2L),
  M_2011_3 = c(10L, 14L, 8L),
  M_2012_3 = c(13L, 11L, 8L),
  stringsAsFactors = FALSE
)

R: New column storing the name of any of the already existing ones based on condition

Say if we have a dataframe looking like this below:
a b c d
22 18 25 9
12 24 6 18
37 8 22 25
24 19 12 27
I would like to create two new columns out of these ones:
a) One column storing the name of the column in which each row gets its highest value.
b) Another one storing its highest value.
In other words, my desired output would look as follows:
a b c d max_col max_val
22 18 25 9 c 25
12 24 6 18 b 24
37 8 22 25 a 37
24 19 12 27 d 27
How can I achieve this?
Does this work:
> library(dplyr)
> # Evaluate row by row: which.max() picks the position of the largest of a:d
> # and max() its value. ungroup() then drops the rowwise grouping so that
> # later verbs operate on the whole table again.
> df %>%
+   rowwise() %>%
+   mutate(max_col = names(df)[which.max(c_across(a:d))],
+          max_val = max(c_across(a:d))) %>%
+   ungroup()
# A tibble: 4 x 6
a b c d max_col max_val
<dbl> <dbl> <dbl> <dbl> <chr> <dbl>
1 22 18 25 9 c 25
2 12 24 6 18 b 24
3 37 8 22 25 a 37
4 24 19 12 27 d 27
>
It can also be achieved by reshaping the data and merging:
library(tidyverse)
#Code
# Tag every row with an id, compute the per-row maximum on a long copy of the
# data, and join it back onto the original columns.
newdf <- df %>%
  mutate(id = row_number()) %>%
  left_join(
    df %>%
      mutate(id = row_number()) %>%
      pivot_longer(-id, names_to = "max_col", values_to = "max_val") %>%
      group_by(id) %>%
      # which.max() keeps exactly one row per id (the first maximum), so a
      # tie between columns cannot duplicate rows in the join below
      slice(which.max(max_val)) %>%
      ungroup(),
    by = "id" # explicit key instead of relying on the common-column guess
  ) %>%
  select(-id)
Output:
a b c d max_col max_val
1 22 18 25 9 c 25
2 12 24 6 18 b 24
3 37 8 22 25 a 37
4 24 19 12 27 d 27
Some data used:
#Data
# Example data: four integer columns, four rows.
df <- data.frame(
  a = c(22L, 12L, 37L, 24L),
  b = c(18L, 24L, 8L, 19L),
  c = c(25L, 6L, 22L, 12L),
  d = c(9L, 18L, 25L, 27L)
)
We can do this in a vectorized efficient way with max.col from base R - gets the position index of the max value in a row, which is used to extract the corresponding column name with [ , and pmax to return the max value per row
# Name of the column holding each row's maximum (ties go to the leftmost one)
mcol <- colnames(df)[max.col(df, ties.method = "first")]
# Row-wise maximum: pmax() is handed all columns at once
mval <- do.call(pmax, df)
# Attach both results as new columns
df$max_col <- mcol
df$max_val <- mval
-output
df
# a b c d max_col max_val
#1 22 18 25 9 c 25
#2 12 24 6 18 b 24
#3 37 8 22 25 a 37
#4 24 19 12 27 d 27
Or using tidyverse, we can use the same max.col and pmax to get the column names of the max value per row and the max value of the row
library(dplyr)
library(purrr)
# max.col() gives the position of each row's maximum and pmax() the value.
# The second step runs after max_col has been added, so only the numeric
# columns are passed to pmax(). Base do.call() replaces the retired
# purrr::invoke().
# NOTE(review): cur_data() is deprecated in dplyr >= 1.1 in favour of
# pick(everything()); switch once the minimum dplyr version is confirmed.
df %>%
  mutate(max_col = names(cur_data())[max.col(cur_data(), "first")],
         max_val = do.call(pmax, select(cur_data(), where(is.numeric))))
-output
# a b c d max_col max_val
# 1 22 18 25 9 c 25
# 2 12 24 6 18 b 24
# 3 37 8 22 25 a 37
# 4 24 19 12 27 d 27
data
# Example data: four integer columns, four rows.
df <- data.frame(
  a = c(22L, 12L, 37L, 24L),
  b = c(18L, 24L, 8L, 19L),
  c = c(25L, 6L, 22L, 12L),
  d = c(9L, 18L, 25L, 27L)
)
Here is another base R option
# Position of the maximum in each row. max.col() breaks ties at random by
# default, so ties.method = "first" is set to make the result reproducible.
inds <- max.col(df, ties.method = "first")
# Look up the column name and, via (row, column) matrix indexing, the value
# at that position for every row.
df <- cbind(df,
  max_col = names(df)[inds],
  max_val = df[cbind(seq_along(inds), inds)]
)
which gives
a b c d max_col max_val
1 22 18 25 9 c 25
2 12 24 6 18 b 24
3 37 8 22 25 a 37
4 24 19 12 27 d 27

How can I use merge so that I have data for all times?

I'm trying to reshape a data set so that every entity has a value for every possible time (month). Here's the data I have:
Class Value month
A 10 1
A 12 3
A 9 12
B 11 1
B 10 8
From the data above, I want to get the following data;
Class Value month
A 10 1
A NA 2
A 12 3
A NA 4
....
A 9 12
B 11 1
B NA 2
....
B 10 8
B NA 9
....
B NA 12
So I want every class to have a row for each month from 1 to 12.
How can I do this? I'm currently trying the merge function, but I'd appreciate any other approaches.
We can use tidyverse
library(tidyverse)
# Expand to every Class x month combination between the smallest and largest
# month present in the data; combinations that were missing get NA in Value.
df1 %>%
  complete(Class, month = min(month):max(month)) %>%
  # all_of() replaces the deprecated select_(.dots = ...)
  select(all_of(names(df1))) %>% # if we need to be in the same column order
  as.data.frame() # if needed to convert to 'data.frame'
In base R using merge (where df is your data):
# Full grid of every class with months 1-12. unique() works whether Class is
# a factor or a character column, and no placeholder value column is needed
# because merge() fills unmatched rows with NA on its own.
res <- data.frame(Class = rep(unique(as.character(df$Class)), each = 12),
                  month = 1:12)
# Keep every grid row (all.y) and restore the original column order by name.
merge(df, res, by = c("Class", "month"), all.y = TRUE)[, c("Class", "Value", "month")]
# Class Value month
# 1 A 10 1
# 2 A NA 2
# 3 A 12 3
# 4 A NA 4
# 5 A NA 5
# 6 A NA 6
# 7 A NA 7
# 8 A NA 8
# 9 A NA 9
# 10 A NA 10
# 11 A NA 11
# 12 A 9 12
# 13 B 11 1
# 14 B NA 2
# 15 B NA 3
# 16 B NA 4
# 17 B NA 5
# 18 B NA 6
# 19 B NA 7
# 20 B 10 8
# 21 B NA 9
# 22 B NA 10
# 23 B NA 11
# 24 B NA 12
# Example data: Class is a factor, Value and month are integers.
df <- data.frame(
  Class = factor(c("A", "A", "A", "B", "B"), levels = c("A", "B")),
  Value = c(10L, 12L, 9L, 11L, 10L),
  month = c(1L, 3L, 12L, 1L, 8L)
)
To add to @akrun's answer, if you want to replace the NA values with 0, you can do the following:
library(dplyr)
library(tidyr)
# Expand to every Class x month combination, then overwrite the missing
# values in Value with 0.
df1 %>%
  complete(
    Class,
    month = min(month):max(month)
  ) %>%
  mutate(Value = replace(Value, is.na(Value), 0))

Combine two data frames considering levels of factor of one data frame and column name of another data frame using r

I need to create a new column for a existing data frame considering levels of factors. I have 2 data frames called dat_group and dat_prices. These data frames look like below.
dat_group
Group
1 A
2 A
3 A
4 A
5 A
6 A
7 A
8 A
9 A
10 A
11 C
12 C
13 C
14 C
15 C
16 C
17 C
18 C
19 C
20 C
21 B
22 B
23 B
24 B
25 B
26 B
27 B
28 B
29 B
30 B
dat_price
A B C
1 21 45 24
2 21 45 24
3 21 45 24
4 21 45 24
5 15 11 10
6 15 11 10
7 15 11 10
8 20 13 55
9 20 13 55
10 20 13 55
I need to fill in the values of the A, B and C columns according to the level in dat_group. The row order should be preserved. If I add a new column called "Price" to dat_group:
dat_group$Price<-NA
Then the data frame should be like ;
Group Price
1 A 21
2 A 21
3 A 21
4 A 21
5 A 15
6 A 15
7 A 15
8 A 20
9 A 20
10 A 20
11 C 24
12 C 24
13 C 24
14 C 24
15 C 10
16 C 10
17 C 10
18 C 55
19 C 55
20 C 55
21 B 45
22 B 45
23 B 45
24 B 45
25 B 11
26 B 11
27 B 11
28 B 13
29 B 13
30 B 13
I tried to do this using some available examples e.g.1 e.g.2, but did not work.
Please could anybody help me. The two example data frames can be accessed in following codes. My actual data set has several 1000 rows.
# Example data: 30 group labels in the order A, C, B (ten rows each) and a
# 10-row price table with one column per group.
dat_group <- data.frame(
  Group = factor(rep(c("A", "C", "B"), each = 10L), levels = c("A", "B", "C"))
)
dat_price <- data.frame(
  A = rep(c(21L, 15L, 20L), times = c(4L, 3L, 3L)),
  B = rep(c(45L, 11L, 13L), times = c(4L, 3L, 3L)),
  C = rep(c(24L, 10L, 55L), times = c(4L, 3L, 3L))
)
library(data.table)
dat_price <- as.data.table(dat_price)
# Put the price columns in the order the groups appear in dat_group (A, C, B),
# selecting by name rather than by position.
dat_price_new <- dat_price[, c("A", "C", "B"), with = FALSE]
# Stack the three columns into long format. measure.vars is given explicitly
# so melt() does not have to guess (and warn), and the output columns are
# named after the desired result.
melt(dat_price_new, measure.vars = c("A", "C", "B"),
     variable.name = "Group", value.name = "Price")
A more defensive solution to your problem at hand. Hopefully this will work even if all of your factor's levels are not in identical multiples.
library(dplyr); library(purrr); library(magrittr)
# Remember the incoming row order so it can be restored after the split.
dat_group$original_order <- seq_len(nrow(dat_group))
dat_group %<>%
  split(.$Group, drop = TRUE) %>% # drop = TRUE skips levels with no rows
  map(function(grp) {
    # Look the price column up by NAME. Indexing with the factor itself would
    # use its integer codes and only works while the columns of dat_price
    # happen to be in the same order as the factor levels.
    prices <- na.omit(dat_price[[as.character(grp$Group[1])]])
    # length.out also covers groups whose size is not an exact multiple of
    # the number of prices.
    mutate(grp, Price = rep(prices, length.out = nrow(grp)))
  }) %>%
  bind_rows() %>%
  arrange(original_order) %>%
  select(-original_order)
dat_group
Group Price
1 A 21
2 A 21
3 A 21
4 A 21
5 A 15
6 A 15
7 A 15
8 A 20
9 A 20
10 A 20
11 C 24
12 C 24
13 C 24
14 C 24
15 C 10
16 C 10
17 C 10
18 C 55
19 C 55
20 C 55
21 B 45
22 B 45
23 B 45
24 B 45
25 B 11
26 B 11
27 B 11
28 B 13
29 B 13
30 B 13
Original (lazy) solution:
# Stack the price columns in the order the groups first appear in dat_group
# (A, C, B in the example). A plain unlist(dat_price) stacks them in column
# order (A, B, C) and would hand the B prices to the C rows.
# Assumes the rows of each group are contiguous in dat_group.
dat_group$Price <- rep(
  unlist(as.list(dat_price)[unique(as.character(dat_group$Group))],
         use.names = FALSE),
  length.out = nrow(dat_group)
)

Merge two tables in R; column names differ with A and B options

I have two datasets that I'm trying to merge together. The first one contains information for every test subject with a unique ID (in rows). The second set contains measurements for every test subject (in columns), however each subject was measured twice so the unique ID reads "IDa and IDb." I'd like to find a way to merge these two tables based on the unique ID, regardless of whether it is measurement A or B.
Here's a small sample of the 2 datasets, and a table of the intended output. Any help would be appreciated!
UniqueID Site State Age Height
Tree001 FK OR 23 70
Tree002 FK OR 45 53
Tree003 NM OR 35 84
UniqueID Tree001A Tree001B Tree002A Tree002B Tree003A Tree003B
1996 4 2
1997 7 8 7 3
1998 3 2 9 4 7
1999 11 9 2 12 3 13
2010 8 8 4 6 11 4
2011 10 5 6 3 8 9
UniqueID Tree001A Tree001B Tree002A Tree002B Tree003A Tree003B
Site FK FK FK FK NM NM
State OR OR OR OR OR OR
Age 23 23 45 45 35 35
Height 70 70 53 53 84 84
1996 4 2
1997 7 8 7 3
1998 3 2 9 4 7
1999 11 9 2 12 3 13
2010 8 8 4 6 11 4
2011 10 5 6 3 8 9
This can be one approach.
# Subject table: one row per tree, identifiers and site/state stored as factors.
df1 <- data.frame(
  UniqueID = factor(c("Tree001", "Tree002", "Tree003")),
  Site = factor(c("FK", "FK", "NM")),
  State = factor(c("OR", "OR", "OR")),
  Age = c(23L, 45L, 35L),
  Height = c(70L, 53L, 84L)
)
# Measurement table: one row per year, two columns (A/B) per tree.
df2 <- data.frame(
  UniqueID = c(1996L, 1997L, 1998L, 1999L, 2010L, 2011L),
  Tree001A = c(4L, 7L, 3L, 11L, 8L, 10L),
  Tree001B = c(NA, 8L, 2L, 9L, 8L, 5L),
  Tree002A = c(2L, 7L, 9L, 2L, 4L, 6L),
  Tree002B = c(NA, NA, 4L, 12L, 6L, 3L),
  Tree003A = c(NA, 3L, 7L, 3L, 11L, 8L),
  Tree003B = c(NA, NA, NA, 13L, 4L, 9L)
)
> df1
UniqueID Site State Age Height
1 Tree001 FK OR 23 70
2 Tree002 FK OR 45 53
3 Tree003 NM OR 35 84
> df2
UniqueID Tree001A Tree001B Tree002A Tree002B Tree003A Tree003B
1 1996 4 <NA> 2 <NA> <NA> <NA>
2 1997 7 8 7 <NA> 3 <NA>
3 1998 3 2 9 4 7 <NA>
4 1999 11 9 2 12 3 13
5 2010 8 8 4 6 11 4
6 2011 10 5 6 3 8 9
# Transpose df1 so every subject becomes a column and every attribute a row
df3 <- as.data.frame(t(df1[, -1]))
colnames(df3) <- as.character(df1[, 1])
# Map each measurement column of df2 (e.g. "Tree001A") to its subject by
# stripping the trailing A/B suffix, so nothing is hard-coded by position
subject_ids <- sub("[AB]$", "", names(df2)[-1])
stopifnot(all(subject_ids %in% names(df3)))
# Attribute names become the UniqueID column, and each subject's column is
# repeated once per measurement column; row names are reset to 1..n
df3 <- data.frame(UniqueID = rownames(df3),
                  df3[, subject_ids, drop = FALSE],
                  row.names = NULL, check.names = FALSE,
                  stringsAsFactors = FALSE)
colnames(df3) <- names(df2)
# Convert every column of df2 to a factor so it can be stacked under the
# text-valued attribute rows
df2[] <- lapply(df2, as.factor)
# Now combine the two data frames
new <- rbind(df3, df2)
> new
UniqueID Tree001A Tree001B Tree002A Tree002B Tree003A Tree003B
1 Site FK FK FK FK NM NM
2 State OR OR OR OR OR OR
3 Age 23 23 45 45 35 35
4 Height 70 70 53 53 84 84
5 1996 4 <NA> 2 <NA> <NA> <NA>
6 1997 7 8 7 <NA> 3 <NA>
7 1998 3 2 9 4 7 <NA>
8 1999 11 9 2 12 3 13
9 2010 8 8 4 6 11 4
10 2011 10 5 6 3 8 9

Resources