I'm working with R.
WHAT I HAVE:
ID_1 ID_2 Date x_1 y_2
1 12 3 2011-12-21 15 10
2 12 13 2011-12-22 50 40
3 3 12 2011-12-22 20 30
4 15 13 2011-12-23 30 20
...
and so on
TARGET:
ID_1 ID_2 Date x_1 y_2 XX_1 YY_2
1 12 3 2011-12-21 15 10 0 0
2 12 13 2011-12-22 50 40 15 0
3 3 12 2011-12-22 20 30 10 50
4 15 13 2011-12-23 30 20 0 40
...
and so on
In XX_1 and YY_2 I want to see the values from the columns x_1 and y_2 that correspond to the previous occurrence of the respective ID in ID_1 or ID_2, or 0 if no value is available before that date. I don't know how to handle the fact that the same ID can appear in either ID_1 or ID_2 (like IDs 3 and 12 in the example).
@Ekatef
ID_1 AND ID_2 (find a match of the whole ID pair, even if the order of the IDs is switched):
ID_1 ID_2 Date x_1 y_2 XX_1 YY_2
1 12 3 2011-12-21 15 10 0 0
2 12 13 2011-12-22 50 40 0 0
3 3 12 2011-12-22 20 30 10 15
4 15 13 2011-12-23 30 20 0 0
5 12 13 2011-12-23 10 5 50 40
The OP has requested to copy the previous value for an ID (if any) into the appropriate new column.
This can be solved by reshaping multiple columns simultaneously from wide to long format, finding the previous value by shifting / lagging, and reshaping back to wide format:
library(data.table)
setDT(DF)[, rn := .I]
long <- melt(DF, id.vars = c("rn", "Date"), measure.vars = patterns("^ID", "^x|y"),
value.name = c("ID", "value"))
long[order(Date), previous := shift(value, fill = 0), by = ID]
dcast(long, rn + Date ~ variable, value.var = c("ID", "value", "previous"))
rn Date ID_1 ID_2 value_1 value_2 previous_1 previous_2
1: 1 2011-12-21 12 3 15 10 0 0
2: 2 2011-12-22 12 13 50 40 15 0
3: 3 2011-12-22 3 12 20 30 10 50
4: 4 2011-12-23 15 13 30 20 0 40
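For reference, the intermediate long table (after melting and computing previous, still in the row order melt() returns) should look like this for the sample data:
   rn       Date variable ID value previous
1:  1 2011-12-21        1 12    15        0
2:  2 2011-12-22        1 12    50       15
3:  3 2011-12-22        1  3    20       10
4:  4 2011-12-23        1 15    30        0
5:  1 2011-12-21        2  3    10        0
6:  2 2011-12-22        2 13    40        0
7:  3 2011-12-22        2 12    30       50
8:  4 2011-12-23        2 13    20       40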
Alternatively, the final call to dcast() can be replaced by an update while joining:
DF[long, on = .(rn),
c("XX_1", "YY_2") := .(previous[variable == 1L], previous[variable == 2L])][
, rn := NULL]
DF
ID_1 ID_2 Date x_1 y_2 XX_1 YY_2
1: 12 3 2011-12-21 15 10 0 0
2: 12 13 2011-12-22 50 40 15 0
3: 3 12 2011-12-22 20 30 10 50
4: 15 13 2011-12-23 30 20 0 40
which exactly reproduces the OP's expected result.
Data
library(data.table)
DF <- fread(
"i ID_1 ID_2 Date x_1 y_2
1 12 3 2011-12-21 15 10
2 12 13 2011-12-22 50 40
3 3 12 2011-12-22 20 30
4 15 13 2011-12-23 30 20 ",
drop = 1L
)
If I understand you correctly, the target ID should be looked up from left to right and from bottom to top in all the rows strictly above the given ID value. I would write a function to find the coordinates of the preceding ID like this:
# find the indices of the preceding ID value
# id_matrix == your_data_frame[, c("ID_1", "ID_2")]
# [i_of_row, i_of_col] are the coordinates of the considered ID
# i_of_row > 1 is assumed
FindPreviousID <- function(id_matrix, i_of_row, i_of_col) {
  # keep only the rows strictly above the considered one
  shorten_matrix <- id_matrix[1:(i_of_row - 1), , drop = FALSE]
  # search the row-wise flattened matrix from the bottom-right corner backwards
  rev_ind <- match(table = rev(t(shorten_matrix)),
                   x = id_matrix[i_of_row, i_of_col], nomatch = NA_real_)
  n_row_found <- floor((length(shorten_matrix) - rev_ind) / ncol(shorten_matrix)) + 1
  n_col_found <- (length(shorten_matrix) - rev_ind) %% ncol(shorten_matrix) + 1
  return(c(row = n_row_found, col = n_col_found))
}
...and use it to calculate XX_1 and YY_2:
# emulate the original dataframe
ID_1 <- c(12,12,3,15,16,3)
ID_2 <- c(3,13,12,13,17,15)
ids <- cbind(ID_1, ID_2) # IDs columns
x1 <- c(15, 50, 20, 30, 51, 60)
y2 <- c(10, 40, 30, 20, 53, 62)
vars <- cbind(x1, y2) # x&y columns
# assuming that the first XX_1 & YY_2 should be always 0
indices_XX <- sapply(FUN = function(i) FindPreviousID(id_matrix = ids, i_of_row = i, i_of_col = 1),
                     X = seq(along.with = ids[, 1])[-1])
indices_YY <- sapply(FUN = function(i) FindPreviousID(id_matrix = ids, i_of_row = i, i_of_col = 2),
                     X = seq(along.with = ids[, 1])[-1])
# construct XX and YY columns
XX_column <- c(NA, vars[t(indices_XX)])
XX_column[is.na(XX_column)] <- 0
YY_column <- c(NA, vars[t(indices_YY)])
YY_column[is.na(YY_column)] <- 0
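The two new columns can then be attached to the emulated data to mirror the question's layout (a sketch; x1/y2 stand in for the question's x_1/y_2):
result <- data.frame(ID_1, ID_2, x_1 = x1, y_2 = y2,
                     XX_1 = XX_column, YY_2 = YY_column)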
Hope that helps :)
Upd: If you want to find a pair of IDs instead of a single ID, the function needs to be redesigned. One possible solution looks like this:
FindPreviousIDsPair <- function(id_matrix, i_of_row) {
  # keep only the rows strictly above the considered one
  shorten_matrix <- id_matrix[1:(i_of_row - 1), , drop = FALSE]
  string_to_search_for <- id_matrix[i_of_row, ]
  string_to_search_for_sorted <-
    string_to_search_for[order(string_to_search_for)]
  # TRUE for every earlier row that contains the same ID pair (in any order)
  found_rows_boolean <- sapply(FUN = function(i)
    all(shorten_matrix[i, order(shorten_matrix[i, ])] ==
          string_to_search_for_sorted), X = 1:(i_of_row - 1))
  found_row_n <- ifelse(any(found_rows_boolean),
    max(which(found_rows_boolean)), NA_real_)
  found_col_of_ID1 <- ifelse(any(found_rows_boolean),
    match(string_to_search_for[1], shorten_matrix[found_row_n, ]), NA_real_)
  found_col_of_ID2 <- ifelse(any(found_rows_boolean),
    match(string_to_search_for[2], shorten_matrix[found_row_n, ]), NA_real_)
  return(c(found_row_n, found_col_of_ID1, found_col_of_ID2))
}
Application of the redesigned look-up function to calculate XX and YY:
indices_of_vars <- sapply(FUN = function(i) FindPreviousIDsPair(id_matrix = ids, i_of_row = i),
                          X = seq(along.with = ids[, 1])[-1])
indices_XX <- indices_of_vars[1:2, ]
indices_YY <- indices_of_vars[c(1, 3), ]
XX_column <- c(NA, vars[t(indices_XX)])
XX_column[is.na(XX_column)] <- 0
YY_column <- c(NA, vars[t(indices_YY)])
YY_column[is.na(YY_column)] <- 0
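To check the pair version against the emulated data (a quick sketch; only row 3 has an earlier row with the same ID pair, namely row 1 with the order switched):
cbind(ids, XX_1 = XX_column, YY_2 = YY_column)
#      ID_1 ID_2 XX_1 YY_2
# [1,]   12    3    0    0
# [2,]   12   13    0    0
# [3,]    3   12   10   15
# [4,]   15   13    0    0
# [5,]   16   17    0    0
# [6,]    3   15    0    0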
Related
I have the following data frame in R:
df <- data.frame(name = c('p1_start','p1_end','p2_start','p2_end','p1_start','p1_end','p2_start','p2_end','p1_start','p1_end','p2_start','p2_end','p1_start','p1_end','p2_start','p2_end'),
time = c(1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31),
target = c(1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2),
comb = c(0,0,0,0,1,1,1,1,0,0,0,0,1,1,1,1))
And another data frame:
data <- data.frame(time = c(2,5,8,14,14,20,21,26,28,28),
name = c('a','b','c','d','e','f','g','h','i','j'))
So, if we take a look at df, we can sort the data by target and comb and notice that there are basically "groups". For example, for target=1 and comb=0 there are four entries p1_start, p1_end, p2_start, p2_end, and it is the same for all other target/comb combinations.
On the other side data contains entries with time being a timestamp.
Goal: I want to map the values from both data frames based on time.
Example: The first entry of data has time=2, meaning it happened between p1_start and p1_end, so it should get the values target=1 and comb=0 mapped onto the data data frame.
Example 2: The entries of data with time=14 happened between p2_start and p2_end, so they should get the values target=1 and comb=1 mapped onto the data data frame.
Idea: I thought I could iterate over df by target and comb and, for each combination of them, check whether there are rows in data whose time falls in between. That check could be done with the following command:
data[which(data$time > p1_start & data$time < p2_end),]
once I get the rows it is easy to append the values.
Problem: how could I do the iteration? I tried with the following:
df %>%
group_by(target, comb) %>%
print(data[which(data$time > df$p1_start & data$time < df$p2_end),])
But I am getting an error that time has not been initialized.
Your problem is best known as a non-equi join: we need to find the range in a given dataframe that corresponds to each value in one or more given vectors. This is handled well by the data.table package.
We would first transform your df into a format suitable for performing the join, and then join data with df by time <= end and time >= start. Here is the code:
library(data.table)
setDT(df)[, c("type", "name") := tstrsplit(name, "_", fixed = TRUE)]
df <- dcast(df, ... ~ name, value.var = "time")
cols <- c("target", "comb", "type")
setDT(data)[df, (cols) := mget(paste0("i.", cols)), on = .(time<=end, time>=start)]
After dcast, df looks like this
target comb type end start
1: 1 0 p1 3 1
2: 1 0 p2 7 5
3: 1 1 p1 11 9
4: 1 1 p2 15 13
5: 2 0 p1 19 17
6: 2 0 p2 23 21
7: 2 1 p1 27 25
8: 2 1 p2 31 29
And the output is
> data
time name target comb type
1: 2 a 1 0 p1
2: 5 b 1 0 p2
3: 8 c NA NA <NA>
4: 14 d 1 1 p2
5: 14 e 1 1 p2
6: 20 f NA NA <NA>
7: 21 g 2 0 p2
8: 26 h 2 1 p1
9: 28 i NA NA <NA>
10: 28 j NA NA <NA>
Here is a tidyverse solution:
library(tidyr)
library(dplyr)
df %>%
rename(name_df=name) %>%
mutate(x = time + 1) %>%
pivot_longer(
cols = c(time, x),
names_to = "helper",
values_to = "time"
) %>%
right_join(data, by="time") %>%
select(time, name, target, comb)
time name target comb
<dbl> <chr> <dbl> <dbl>
1 2 a 1 0
2 5 b 1 0
3 8 c 1 0
4 14 d 1 1
5 14 e 1 1
6 20 f 2 0
7 21 g 2 0
8 26 h 2 1
9 28 i 2 1
10 28 j 2 1
df <- data.frame(name = c('p1_start','p1_end','p2_start','p2_end','p1_start','p1_end','p2_start','p2_end','p1_start','p1_end','p2_start','p2_end','p1_start','p1_end','p2_start','p2_end'),
time = c(1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31),
target = c(1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2),
comb = c(0,0,0,0,1,1,1,1,0,0,0,0,1,1,1,1))
data <- data.frame(time = c(2,5,8,14,14,20,21,26,28,28),
name = c('a','b','c','d','e','f','g','h','i','j'))
library(fuzzyjoin)
library(tidyverse)
tmp <- df %>%
separate(name,
into = c("p", "period"),
sep = "_",
remove = TRUE) %>%
pivot_wider(
id_cols = c(p, target, comb),
names_from = period,
values_from = time
) %>%
select(-p)
fuzzy_left_join(
x = data,
y = tmp,
by = c("time" = "start",
"time" = "end"),
match_fun = list(`>=`, `<=`))
#> time name target comb start end
#> 1 2 a 1 0 1 3
#> 2 5 b 1 0 5 7
#> 3 8 c NA NA NA NA
#> 4 14 d 1 1 13 15
#> 5 14 e 1 1 13 15
#> 6 20 f NA NA NA NA
#> 7 21 g 2 0 21 23
#> 8 26 h 2 1 25 27
#> 9 28 i NA NA NA NA
#> 10 28 j NA NA NA NA
Created on 2022-01-11 by the reprex package (v2.0.1)
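If the interval bounds start and end are not needed in the result, they can simply be dropped at the end of the pipeline (a small optional addition to the call above):
fuzzy_left_join(
  x = data,
  y = tmp,
  by = c("time" = "start",
         "time" = "end"),
  match_fun = list(`>=`, `<=`)) %>%
  select(-start, -end)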
I would like to match multiple conditions from independent data tables onto my main data table.
How can I do this using the data.table package?
What would be the most efficient/fastest way?
I have a mock example, with some mock conditions here to illustrate my question:
main_data <- data.frame( pnum = c(1,2,3,4,5,6,7,8,9,10),
age = c(24,35,43,34,55,24,36,43,34,54),
gender = c("f","m","f","f","m","f","m","f","f","m"))
data_1 <- data.frame( pnum = c(1,4,5,8,9),
value_data_1 = c(1, 2, 1, 1, 1),
date = as.Date(c("2019-01-01", "2018-07-01", "2018-01-01", "2016-07-01", "2016-07-01")))
data_2 <- data.frame( pnum = c(1,5,7,8,9),
value_data_2 = c(1, 2, 1, 1, 2),
date = as.Date(c("2019-01-01", "2018-07-01", "2018-01-01", "2016-07-01", "2016-07-01")))
I would like to create a new variable in my main_data table called "matching" of those rows that match between data_1 and data_2 under multiple conditions:
First, the value of data_1$value_data_1 has to be equal to 1.
Second, the value of data_2$value_data_2 also has to be equal to 1.
Third, the pnum and the date should match between data_1 and data_2.
When all these conditions are met, I would expect the new output of main_data to look like this:
> main_data
pnum age gender matching
1 1 24 f 1
2 2 35 m 0
3 3 43 f 0
4 4 34 f 0
5 5 55 m 0
6 6 24 f 0
7 7 36 m 0
8 8 43 f 1
9 9 34 f 0
10 10 54 m 0
Until now, I programmed each condition separately and created new placeholder tables in between, but this is not very memory efficient. Is there an efficient way to chain all the conditions using the data.table package specifically?
Here's one way:
library(data.table)
library(magrittr)
setDT(main_data)
setDT(data_1)
setDT(data_2)
main_data %>%
data_1[., on = .(pnum == pnum) ] %>%
data_2[., on = .(pnum == pnum, date == date) ] %>%
.[, matching := fcoalesce(+(value_data_1 == 1 & value_data_2 == 1), 0L) ] %>%
.[, .(pnum, age, gender, matching) ]
# pnum age gender matching
# 1: 1 24 f 1
# 2: 2 35 m 0
# 3: 3 43 f 0
# 4: 4 34 f 0
# 5: 5 55 m 0
# 6: 6 24 f 0
# 7: 7 36 m 0
# 8: 8 43 f 1
# 9: 9 34 f 0
# 10: 10 54 m 0
I used the magrittr package because I find it useful for portraying the flow of the code. It is not at all required, and the alternative data.table-only pipeline for the same code could be:
data_2[
data_1[main_data, on = .(pnum == pnum) ]
,on = .(pnum == pnum, date == date)
][ ,matching := fcoalesce(+(value_data_1 == 1 & value_data_2 == 1), 0L)
][ ,.(pnum, age, gender, matching) ]
There are other ways to break it out, including the use of temporary (mid-step) variables. (This is mostly style and personal preference.)
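For instance, a sketch of the same logic with temporary mid-step variables:
# step 1: left-join data_1 onto main_data (keeps all main_data rows)
tmp <- data_1[main_data, on = .(pnum)]
# step 2: left-join data_2 onto that, matching both pnum and date
res <- data_2[tmp, on = .(pnum, date)]
# step 3: flag rows where both values are 1, treating NA as no match
res[, matching := fcoalesce(+(value_data_1 == 1 & value_data_2 == 1), 0L)]
res[, .(pnum, age, gender, matching)]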
You can use something like Reduce(merge, list(...))
library(data.table)
setDT(main_data); setDT(data_1); setDT(data_2)
res <- Reduce(function(x, y) {
merge(x, y, by = "pnum", all.x = TRUE)
}, list(main_data, data_1[, -"date"], data_2[, -"date"]))[, `:=`(
matching = 1L - (value_data_1 != 1 | value_data_2 != 1 | is.na(value_data_1) | is.na(value_data_2)),
value_data_1 = NULL,
value_data_2 = NULL
)]
Output
> res[]
pnum age gender matching
1: 1 24 f 1
2: 2 35 m 0
3: 3 43 f 0
4: 4 34 f 0
5: 5 55 m 0
6: 6 24 f 0
7: 7 36 m 0
8: 8 43 f 1
9: 9 34 f 0
10: 10 54 m 0
I have the following data frame as an example
df <- data.frame(score=letters[1:15], total1=1:15, total2=16:30)
> df
score total1 total2
1 a 1 16
2 b 2 17
3 c 3 18
4 d 4 19
5 e 5 20
6 f 6 21
7 g 7 22
8 h 8 23
9 i 9 24
10 j 10 25
11 k 11 26
12 l 12 27
13 m 13 28
14 n 14 29
15 o 15 30
I would like to aggregate my data frame by summing over groups of rows with different names, i.e.
groups sum1 sum2
'a-b-c' 6 51
'd-e-f' 15 60
etc
All the given answers to this kind of question assume that the grouping strings repeat in the rows.
The usual aggregate call that I use to obtain summaries delivers a different result:
aggregate(df$total1, by=list(sum1=df$score %in% c('a','b','c'), sum2=df$score %in% c('d','e','f')), FUN=sum)
sum1 sum2 x
1 FALSE FALSE 99
2 TRUE FALSE 6
3 FALSE TRUE 15
If you want a tidyverse solution, here is one possibility:
library(dplyr)
df <- data.frame(score=letters[1:15], total1=1:15, total2=16:30)
df %>%
mutate(groups = case_when(
score %in% c("a","b","c") ~ "a-b-c",
score %in% c("d","e","f") ~ "d-e-f"
)) %>%
group_by(groups) %>%
summarise_if(is.numeric, sum)
returns
# A tibble: 3 x 3
groups total1 total2
<chr> <int> <int>
1 a-b-c 6 51
2 d-e-f 15 60
3 <NA> 99 234
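Note that summarise_if() has since been superseded in dplyr; with dplyr >= 1.0.0 the last step can equivalently be written with across():
df %>%
  mutate(groups = case_when(
    score %in% c("a","b","c") ~ "a-b-c",
    score %in% c("d","e","f") ~ "d-e-f"
  )) %>%
  group_by(groups) %>%
  summarise(across(where(is.numeric), sum))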
Add a "groups" column with the category value.
df$groups = NA
and then define each group like this:
df$groups[df$score=="a" | df$score=="b" | df$score=="c" ] = "a-b-c"
Finally aggregate by that column.
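A minimal sketch of that final aggregation step (the formula interface drops the NA group by default):
aggregate(cbind(total1, total2) ~ groups, data = df, FUN = sum)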
Here's a solution that works for any sized data frame.
df <- data.frame(score=letters[1:15], total1=1:15, total2=16:30)
# I'm adding a row to demonstrate that the grouping pattern works when the
# number of rows is not equally divisible by 3.
df <- rbind(df, data.frame(score = letters[16], total1 = 16, total2 = 31))
# A vector that represents the correct groupings for the data frame.
groups <- c(rep(1:floor(nrow(df) / 3), each = 3),
rep(floor(nrow(df) / 3) + 1, nrow(df) - length(1:(nrow(df) / 3)) * 3))
# Your method of aggregation by `groups`. I'm going to use `data.table`.
require(data.table)
dt <- as.data.table(df)
dt[, group := groups]
aggDT <- dt[, list(score = paste0(score, collapse = "-"),
total1 = sum(total1), total2 = sum(total2)), by = group][
, group := NULL]
aggDT
score total1 total2
1: a-b-c 6 51
2: d-e-f 15 60
3: g-h-i 24 69
4: j-k-l 33 78
5: m-n-o 42 87
6: p 16 31
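As an aside, the same grouping vector can be built in one line; ceiling() over the row index yields identical groups, including the trailing partial group:
groups <- ceiling(seq_len(nrow(df)) / 3)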
I am trying to separate the contents of columns into two rows, duplicating the row names. Each value consists of only two digits (11, 12, 13, 14, 21, 22, etc., or an NA). This is for conversion to STRUCTURE format, a common population genetics format.
I have this:
population X354045 X430045 X995019
Crater <NA> 11 22
Teton 11 31 11
I would like to have this:
population X354045 X430045 X995019
Crater <NA> 1 2
Crater <NA> 1 2
Teton 1 3 1
Teton 1 1 1
This is a data.table question, so I would simply suggest the built-in tstrsplit function for that matter.
Reading your data:
library(data.table)
DT <- fread('population X354045 X430045 X995019
Crater NA 11 22
Teton 11 31 11')
Solution (if you have a data.frame, use setDT(DT) in order to convert to a data.table)
DT[, lapply(.SD, function(x) unlist(tstrsplit(x, ""))), by = population]
# population X354045 X430045 X995019
# 1: Crater NA 1 2
# 2: Crater NA 1 2
# 3: Teton 1 3 1
# 4: Teton 1 1 1
OK, so here is how I would do it. Let's create some data:
vector <- c(10, 11, 12, NA, 13, 14, 15)
First, we need a function that allows you to break each two-digit number into its two digits (and NAs into two NAs):
as.numeric(sapply(vector, function(x) (x %% c(1e2,1e1)) %/% c(1e1,1e0)))
# 1 0 1 1 1 2 NA NA 1 3 1 4 1 5
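To see why this works, take 31 as an example: the length-2 divisor vectors are recycled against the single value, so the first element extracts the tens digit and the second the units digit:
(31 %% c(1e2, 1e1)) %/% c(1e1, 1e0)
# (31 %% 100) %/% 10 = 3 and (31 %% 10) %/% 1 = 1
# [1] 3 1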
Now all we have to do is apply this to every relevant column:
DF <- data.frame(population = c("Crater", "Teton"), X354045 = c(NA, 11), X430045 = c(11, 31), X995019 = c(22, 11))
DF2 <- apply(DF[-1], 2, function(y) as.numeric(sapply(y, function(x) (x %% c(1e2,1e1)) %/% c(1e1,1e0))))
Finally, we just combine it with the new population column:
population <- as.character(rep(DF$population, each = 2))
DF3 <- cbind(population, data.frame(DF2))
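For the example data, DF3 should then match the desired layout:
  population X354045 X430045 X995019
1     Crater      NA       1       2
2     Crater      NA       1       2
3      Teton       1       3       1
4      Teton       1       1       1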
Here is a base R alternative: duplicate each row, then keep the first digit in the first copy of each row and the second digit in the second copy using substr().
dd <- read.table(header = TRUE, text = 'population X354045 X430045 X995019
Crater NA 11 22
Teton 11 31 11')
nr <- nrow(dd)
dd <- dd[rep(1:2, each = nr), ]
# population X354045 X430045 X995019
# 1 Crater NA 11 22
# 1.1 Crater NA 11 22
# 2 Teton 11 31 11
# 2.1 Teton 11 31 11
dd[, -1] <- lapply(dd[, -1], function(x) {
idx <- (seq_along(x) %% 2 == 0) + 1L
substr(x, idx, idx)
})
# population X354045 X430045 X995019
# 1 Crater <NA> 1 2
# 1.1 Crater <NA> 1 2
# 2 Teton 1 3 1
# 2.1 Teton 1 1 1
Or just
dd <- dd[rep(1:2, each = nr), ]
dd[, -1] <- lapply(dd[, -1], function(x)
Vectorize(substr)(x, rep(1:2, nr), rep(1:2, nr)))
would work
And the same idea in data.table, thanks to @DavidArenburg:
library('data.table')
dd <- read.table(header = TRUE, text = 'population X354045 X430045 X995019
Crater NA 11 22
Teton 11 31 11')
setDT(dd)[rep(1:2, each = .N), lapply(.SD, substr, 1:2, 1:2), by = population]
# population X354045 X430045 X995019
# 1: Crater NA 1 2
# 2: Crater NA 1 2
# 3: Teton 1 3 1
# 4: Teton 1 1 1
Or similarly, but avoiding the by part
dd <- setDT(dd)[rep(1:2, each = .N)]
dd[, 2:4 := dd[, lapply(.SD, substr, 1:2, 1:2), .SDcols = -1]]
which should be pretty fast/efficient if you are working with a large data set
I am trying to build a data frame in which a series of columns each contain a random re-assignment of another column. The data has some structure that needs to be maintained: namely, I want to randomise the assignment of L many times over, while maintaining the structure in V. I want to take a dataframe that looks like this;
L B V A
1 1 1 2 10.9
2 1 1 2 6.5
3 1 1 2 8.6
4 1 1 3 11.1
5 1 1 4 13.1
6 1 1 6 11.5
And create this;
ID L B V A R1 R2 R3 R4 R5
1 1_1_2 1 1 2 10.9 27 20 19 6 26
2 1_1_2 1 1 2 6.5 27 20 19 6 26
3 1_1_2 1 1 2 8.6 27 20 19 6 26
4 1_1_3 1 1 3 11.1 6 28 4 26 26
5 1_1_4 1 1 4 13.1 16 2 6 14 32
6 1_1_6 1 1 6 11.5 17 21 3 11 25
I can do this manually using the script below, but I wonder if there is a smooth way to automate it, because I want to do it for hundreds of randomisations to make columns R1, R2, R3, ..., Rn (so a loop would be preferred to manual repetition of the code).
# Example Data Frame #
df = data.frame(sample(1:33, 1000, replace = T), sample(1:3, 1000, replace = T), sample(1:9, 1000, replace = T), round(rnorm(1000, 10, 2),1))
colnames(df) = c("L", "B", "V", "A")
df = transform(df,id=as.numeric(factor(df$V)))
df = data.frame(as.factor(df[,1]),as.factor(as.numeric(df[,2])),as.factor(df[,5]),as.numeric(df[,4]))
colnames(df) = c("L","B","V","A")
df = df[order(df$L, df$B, df$V),]
rownames(df) = NULL
head(df)
# ID #
df$ID = paste(df[,1], df[,2], df[,3], sep = "_")
ID = unique(as.vector(df$ID))
# R1 #
ID2 = data.frame(ID, sample(ID)); colnames(ID2) = c("ID","R1")
df = merge(df, ID2)
df$R1 = as.factor(do.call(rbind, strsplit(as.vector(df$R1), split="_"))[,1])
# R2 #
ID2 = data.frame(ID, sample(ID)); colnames(ID2) = c("ID","R2")
df = merge(df, ID2)
df$R2 = as.factor(do.call(rbind, strsplit(as.vector(df$R2), split="_"))[,1])
# R3 #
ID2 = data.frame(ID, sample(ID)); colnames(ID2) = c("ID","R3")
df = merge(df, ID2)
df$R3 = as.factor(do.call(rbind, strsplit(as.vector(df$R3), split="_"))[,1])
# R4 #
ID2 = data.frame(ID, sample(ID)); colnames(ID2) = c("ID","R4")
df = merge(df, ID2)
df$R4 = as.factor(do.call(rbind, strsplit(as.vector(df$R4), split="_"))[,1])
# R5 #
ID2 = data.frame(ID, sample(ID)); colnames(ID2) = c("ID","R5")
df = merge(df, ID2)
df$R5 = as.factor(do.call(rbind, strsplit(as.vector(df$R5), split="_"))[,1])
How can I create a loop which will do this for n columns?
Following from the above code I finally got to an answer:
# ID #
df$ID = paste(df[,1], df[,2], df[,3], sep = "_")
ID = unique(as.vector(df$ID))
n = 5
Rs = paste0("R", 1:n)  # "R1" "R2" ... "R5"
Rs
for(i in 1:n){
  df[,5+i] = NA
  colnames(df)[5+i] = paste(Rs[i])
  ID = unique(as.vector(df$ID))
  ID2 = data.frame(ID, sample(ID))
  ID2 = merge(df, ID2)
  df[5+i] = as.factor(do.call(rbind, strsplit(as.vector(ID2[,6+i]), split="_"))[,1])
}
head(df)
Gives the result:
L B V A ID R1 R2 R3 R4 R5
1 1 1 2 10.1 1_1_2 21 12 27 4 26
2 1 1 4 7.7 1_1_4 7 29 2 9 10
3 1 1 5 9.7 1_1_5 27 27 3 1 22
4 1 1 5 8.3 1_1_5 27 27 3 1 22
5 1 1 7 9.5 1_1_7 13 15 32 19 11
6 1 1 7 12.4 1_1_7 13 15 32 19 11
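A slightly more compact version of the same loop (a sketch with identical logic, using a named lookup vector instead of merge(), which also avoids merge() re-sorting the rows):
for (i in 1:n) {
  # randomly pair each unique ID with another ID, then keep its L component
  map <- setNames(sample(ID), ID)
  df[[paste0("R", i)]] <- as.factor(do.call(rbind, strsplit(map[df$ID], "_"))[, 1])
}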