Add multiple rows based on column string R - r

I have a data frame that looks like this...
patient_ID CBC CBN totindex samp index
120 1007 BLOQ BLOQ 7 8 1
121 1007 BLOQ BLOQ 8 9 1
122 1007 BLOQ BLOQ 9 10 1
123 1007 BLOQ BLOQ 10 11 1
124 1007 BLOQ BLOQ 11 12 1
125 1007 BLOQ BLOQ 12 15 4
126 1007 BLOQ BLOQ 13 16 1
127 1007 BLOQ BLOQ 14 17 1
128 1007 BLOQ BLOQ 15 18 1
129 1007 BLOQ BLOQ 16 19 1
130 1007 BLOQ BLOQ 17 20 1
I created the index column to denote any rows in the samp column to show where the previous row is not in numerical order. For example, if the numbers are a difference of one, I make the index of the row 3. In this case, there was a difference of 2 in the samp column between rows 124 and 125. So I placed a 4. I do this for other examples as well like a difference of 3 or 4 is indexed as a 5 or 6.
How can I insert 1 or 2 or 3 or 4 new blank lines in between the discrepancies?
In this example, I would like this output...
patient_ID CBC CBN totindex samp index
120 1007 BLOQ BLOQ 7 8 1
121 1007 BLOQ BLOQ 8 9 1
122 1007 BLOQ BLOQ 9 10 1
123 1007 BLOQ BLOQ 10 11 1
124 1007 BLOQ BLOQ 11 12 1
125 1007 0 0 0 13 1
126 1007 0 0 0 14 1
127 1007 BLOQ BLOQ 12 15 1
128 1007 BLOQ BLOQ 13 16 1
129 1007 BLOQ BLOQ 14 17 1
130 1007 BLOQ BLOQ 15 18 1
131 1007 BLOQ BLOQ 16 19 1
132 1007 BLOQ BLOQ 17 20 1
Thus, adding 2 rows equal to 0 before the 4 index, keep the patient_ID, remove the 4, and add the 13 and 14 to the samp column.
I have tried a for loop like this...
# Row names (converted to numbers) of the rows whose index value contains "4".
# NOTE(review): these numbers are used below as row positions, which assumes
# the row names of df equal its row positions — confirm.
nidx4 <- as.numeric(rownames(df[grep("4", df$index), ]))
# Accumulator for the rebuilt data frame.
dfnew <- data.frame()
# Cut df into chunks at the flagged rows, pad each chunk, then stack the chunks.
for (idx in 1:length(nidx4)) {
if (idx==1){
# First chunk: every row before the first flagged row.
df1 = df[1:(nidx4[idx]-1),]
}
else if (idx == length(nidx4)) {
# Last iteration: from the previous flagged row through the end of df.
# NOTE(review): the last flagged row sits inside this chunk, so the zero rows
# below are appended after the final data row rather than before that flagged
# row. With a single flagged row this branch never runs (idx == 1 wins), so
# the rows from the flagged row onward are never added to dfnew.
df1 = df[nidx4[idx-1]:nrow(df),]
}
else {
# Middle chunk: from the previous flagged row up to the row before this one.
df1 = df[(nidx4[idx-1]):(nidx4[idx]-1),]
}
# Append two all-zero rows to the end of the chunk (always two, whatever the gap).
df1[nrow(df1)+1,] = 0
df1[nrow(df1)+1,] = 0
# Continue the running count in column 21 across the two new rows
# (presumably the samp column of the full data — confirm).
df1[nrow(df1)-1,21] = df1[nrow(df1)-2,21]+1
df1[nrow(df1),21] = df1[nrow(df1)-1,21]+1
dfnew = rbind(dfnew,df1)
}
# Reset index to 1 on the padded rows (0) and on the flagged rows (4).
for (row in 1:nrow(dfnew)){
if (dfnew[row,"index"] == 0) {dfnew[row,"index"] = 1}
if (dfnew[row,"index"] == 4) {dfnew[row,"index"] = 1}
}
# Drop the old row names so rows are renumbered, then overwrite df.
rownames(dfnew) <- NULL
df <- dfnew
But that only accounts for the first few indices. The last iteration of 4 in the index is not accounted for. Also, I had to change the loop code to this because, before, it was stopping at the last iteration of 4 and not including the rest of the data frame.
Any help would be great.
Edit + Answer
This worked for me by adding in all of the column names that I wanted to iterate over.
dfnew <- complete(df, patient_ID, samp = full_seq(samp, period = 1),
fill = list("Sample_Name_(run_ID)" = "no_sample",
Sample_Name = "no_sample", THC = "0", OH_THC = "0",
THC_COOH = "0", THC_COO_gluc = "0", THC_gluc = "0",
CBD = "0", "6aOH_CBD" = "0", "7OH_CBD" = "0",
"6bOH_CBD" = "0", CBD_COOH = "0", CBD_gluc = "0",
CBC = "0", CBN = "0", CBG = "0", THCV = "0", CBDV = "0",
totindex = 0,
index = 1)) %>%
mutate(index = 1)

We may use complete
library(tidyr)
library(dplyr)
complete(df1, patient_ID, samp = full_seq(samp, period = 1),
fill = list(CBC = "0", CBN = "0", totindex = 0, index = 1)) %>%
mutate(index = 1)
-output
# A tibble: 13 × 6
patient_ID samp CBC CBN totindex index
<int> <dbl> <chr> <chr> <int> <dbl>
1 1007 8 BLOQ BLOQ 7 1
2 1007 9 BLOQ BLOQ 8 1
3 1007 10 BLOQ BLOQ 9 1
4 1007 11 BLOQ BLOQ 10 1
5 1007 12 BLOQ BLOQ 11 1
6 1007 13 0 0 0 1
7 1007 14 0 0 0 1
8 1007 15 BLOQ BLOQ 12 1
9 1007 16 BLOQ BLOQ 13 1
10 1007 17 BLOQ BLOQ 14 1
11 1007 18 BLOQ BLOQ 15 1
12 1007 19 BLOQ BLOQ 16 1
13 1007 20 BLOQ BLOQ 17 1
Or use
# Insert the missing samp values per patient first, then fill the NAs that
# appear in the new rows. An explicit lambda is passed to across() because
# forwarding extra arguments through across(...) is deprecated in dplyr >= 1.1.0.
complete(df1, patient_ID, samp = full_seq(samp, period = 1)) %>%
  mutate(across(CBC:CBN, ~ replace_na(.x, "0")),
         totindex = replace_na(totindex, 0), index = 1)
Or use
# Build the fill list programmatically from the column positions
# (CBC, CBN -> "0"; totindex -> 0; index -> 1). `fill` only replaces the NAs
# in the inserted rows, so the existing row flagged with index 4 still has to
# be reset explicitly to match the other variants.
complete(df1, patient_ID, samp = full_seq(samp, period = 1),
  fill = setNames(c(as.list(rep('0', 2)), 0, 1),
    names(df1)[c(2:4, 6)])) %>%
  mutate(index = 1)
data
df1 <- structure(list(patient_ID = c(1007L, 1007L, 1007L, 1007L, 1007L,
1007L, 1007L, 1007L, 1007L, 1007L, 1007L), CBC = c("BLOQ", "BLOQ",
"BLOQ", "BLOQ", "BLOQ", "BLOQ", "BLOQ", "BLOQ", "BLOQ", "BLOQ",
"BLOQ"), CBN = c("BLOQ", "BLOQ", "BLOQ", "BLOQ", "BLOQ", "BLOQ",
"BLOQ", "BLOQ", "BLOQ", "BLOQ", "BLOQ"), totindex = 7:17, samp = c(8L,
9L, 10L, 11L, 12L, 15L, 16L, 17L, 18L, 19L, 20L), index = c(1L,
1L, 1L, 1L, 1L, 4L, 1L, 1L, 1L, 1L, 1L)), class = "data.frame",
row.names = c("120",
"121", "122", "123", "124", "125", "126", "127", "128", "129",
"130"))

Related

recode character to numeric for specific conditions in r

I had an original dataset that looks like this.:
> df.1
id score
1 13_B 1
2 13_C 4
3 133_D 5
4 141 2
5 145 3
6 143 4
7 12_B 6
8 12_C 7
9 12_D 9
I needed to run some processing that requires all the ids to be numeric, so I recoded _B|_C|_D into 1|2|3.
After I finished processing the dataset, which now has an extra group column, here is what my sample dataset looks like:
df.2 <- data.frame(id = c("131","132","133", "141", "145", "143", "121","122","123"),
score = c(1,4,5,2,3,4,6,7,9),
group = c(5,5,5,4,4,4,3,3,3))
> df.2
id score group
1 131 1 5
2 132 4 5
3 133 5 5
4 141 2 4
5 145 3 4
6 143 4 4
7 121 6 3
8 122 7 3
9 123 9 3
At this point, I need to convert the ids back to the original form for the items c(12,13,15). Item 15 is not in this sample dataset, but I need something that works globally. My desired output is:
> df.3
id score group
1 13_B 1 5
2 13_C 4 5
3 13_D 5 5
4 141 2 4
5 145 3 4
6 143 4 4
7 12_B 6 3
8 12_C 7 3
9 12_D 9 3
Any ideas?
Thanks!
Use str_replace_all to recode the substring replacement by passing a named vector (setNames)
library(dplyr)
library(stringr)
df.1 %>%
mutate(id1 = as.numeric(str_replace_all(str_replace(id, "^(\\d{2})\\d+_(.*)",
"\\1_\\2"), setNames(as.character(c(1, 2, 3)), c("_B", "_C", "_D")))))
-output
id score id1
1 13_B 1 131
2 13_C 4 132
3 133_D 5 133
4 141 2 141
5 145 3 145
6 143 4 143
7 12_B 6 121
8 12_C 7 122
9 12_D 9 123
For replacing from 'df.2'
df.2 %>%
mutate(id2 = case_when(substr(id, 1, 2) %in% c(12, 13, 15) ~
str_replace_all(as.character(id), setNames(c("_B", "_C", "_D"),
str_c(1:3, "$"))), TRUE ~as.character(id)))
-output
id score group id2
1 131 1 5 13_B
2 132 4 5 13_C
3 133 5 5 13_D
4 141 2 4 141
5 145 3 4 145
6 143 4 4 143
7 121 6 3 12_B
8 122 7 3 12_C
9 123 9 3 12_D
data
df.1 <- structure(list(id = c("13_B", "13_C", "133_D", "141", "145",
"143", "12_B", "12_C", "12_D"), score = c(1L, 4L, 5L, 2L, 3L,
4L, 6L, 7L, 9L)), row.names = c("1", "2", "3", "4", "5", "6",
"7", "8", "9"), class = "data.frame")
You may try this:
# Rebuild the original ids only for the recoded item prefixes (12, 13, 15):
# keep the two-digit prefix and map the trailing digit 1/2/3 back to _B/_C/_D.
# The suffix is derived from the id itself rather than from the row's position
# within its group, so the result does not depend on row order or group size,
# and every other id (e.g. 141, 145) is passed through unchanged.
df.2 %>%
  mutate(id2 = ifelse(str_sub(id, 1, 2) %in% c("12", "13", "15"),
                      str_c(str_sub(id, 1, 2), "_",
                            LETTERS[as.integer(str_sub(id, 3, 3)) + 1L]),
                      id)) %>%
  select(id, id2, score, group)
id id2 score group
<chr> <chr> <dbl> <dbl>
1 131 13_B 1 5
2 132 13_C 4 5
3 133 13_D 5 5
4 141 141 2 4
5 145 145 3 4
6 143 143 4 4
7 121 12_B 6 3
8 122 12_C 7 3
9 123 12_D 9 3

Select rows with all longitudinal measurements

I have a longitudinal dataset with ID, Wave (Wave1-4), and Score. Here's sample data with the same structure. The length of the original data is around 2000, with 500 participants total, put in long form.
ID Wave Score
1 1001 1 28
2 1001 2 27
3 1001 3 28
4 1001 4 26
5 1002 1 30
6 1002 3 30
7 1003 1 30
8 1003 2 30
9 1003 3 29
10 1003 4 28
11 1004 1 22
12 1005 1 20
13 1005 2 18
14 1006 1 22
15 1006 2 23
16 1006 3 25
17 1006 4 19
I would like to select the 'ID's with all four measurements of 'Score' available. In other words, I want to select rows of the participants with 'Score' available for all 4 waves.
I've been trying to select rows with 'ID's that have data in all 'Wave's. My tryout so far has been based on this idea: if a participant has all four measurements, the ID will appear in the data four times.
That's why I tried to count the number of IDs,
table(data$id) == 4
and although it showed me the number of each ID appearing in the data, I cannot select the corresponding rows.
all.data <- subset(data, subset=table(data$id) == 4)
Because the length of the original data is different, being in long form. "Length of logical index must be 1 or 2637, not 828" I would need a long-form data for further analysis, so I wish not to change it.
You can try:
# ave() broadcasts each ID's row count back onto its rows; keep the rows whose
# ID occurs exactly four times.
df[ave(df$Wave, df$ID, FUN = length) == 4, ]
ID Wave Score
1 1001 1 28
2 1001 2 27
3 1001 3 28
4 1001 4 26
7 1003 1 30
8 1003 2 30
9 1003 3 29
10 1003 4 28
14 1006 1 22
15 1006 2 23
16 1006 3 25
17 1006 4 19
Or, if you want to keep your basic idea, a slight modification of @jay.sf's code:
df[df$ID %in% names(which(table(df$ID) == 4)), ]
I like your table() approach.
> table(d$ID) == 4
1001 1002 1003 1004 1005 1006
TRUE FALSE TRUE FALSE FALSE TRUE
The interesting IDs are in the names() though. So to get your code to work you could extract the IDs like so
subs <- names(which(table(d$ID) == 4))
and get your desired subset using %in%.
all.data <- subset(d, subset=d$ID %in% subs)
Result
> all.data
ID Wave Score
1 1001 1 28
2 1001 2 27
3 1001 3 28
4 1001 4 26
7 1003 1 30
8 1003 2 30
9 1003 3 29
10 1003 4 28
14 1006 1 22
15 1006 2 23
16 1006 3 25
17 1006 4 19
(BTW: Always make sure with ?<name> that you do not define any existing function names as object names, this will save you a lot of trouble. In your case type ?data in a fresh session before loading the object.)
Data
> dput(d)
structure(list(ID = c(1001L, 1001L, 1001L, 1001L, 1002L, 1002L,
1003L, 1003L, 1003L, 1003L, 1004L, 1005L, 1005L, 1006L, 1006L,
1006L, 1006L), Wave = c(1L, 2L, 3L, 4L, 1L, 3L, 1L, 2L, 3L, 4L,
1L, 1L, 2L, 1L, 2L, 3L, 4L), Score = c(28L, 27L, 28L, 26L, 30L,
30L, 30L, 30L, 29L, 28L, 22L, 20L, 18L, 22L, 23L, 25L, 19L)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17"))
instead of feeding table(data$ID), try with
ID %in% names(table(data$ID)[table(data$ID)==4])
As the table gives you the number of occurrences for each ID (named vector)
This is a quick data.table answer.
library(data.table)
dt <- structure(list(ID = c(1001, 1001, 1001, 1001, 1002, 1002, 1003,
1003, 1003, 1003, 1004, 1005, 1005, 1006, 1006, 1006, 1006),
Wave = c(1, 2, 3, 4, 1, 3, 1, 2, 3, 4, 1, 1, 2, 1, 2, 3,
4), Score = c(28, 27, 28, 26, 30, 30, 30, 30, 29, 28, 22,
20, 18, 22, 23, 25, 19)), row.names = c(NA, -17L), class = c("data.table",
"data.frame"))
dt[ , .(Score, N = uniqueN(.SD)) , by = list(ID), .SDcols = c("Wave")][N == 4,]
> ID Score N
1: 1001 28 4
2: 1001 27 4
3: 1001 28 4
4: 1001 26 4
5: 1003 30 4
6: 1003 30 4
7: 1003 29 4
8: 1003 28 4
9: 1006 22 4
10: 1006 23 4
11: 1006 25 4
12: 1006 19 4
For the sake of completeness, here are two data.table solutions. Both identify those IDs for which Wave has values 1 to 4. One approach uses subsetting, the other one is joining.
Subsetting
library(data.table)
# Convert df to a data.table by reference, then keep the rows whose ID has
# four distinct Wave values. The inner lookup must use the same table (df);
# no object named `dt` is defined in this answer.
setDT(df)
df[ID %in% df[, which(uniqueN(Wave) == 4L), by = ID]$ID]
ID Wave Score
1: 1001 1 28
2: 1001 2 27
3: 1001 3 28
4: 1001 4 26
5: 1003 1 30
6: 1003 2 30
7: 1003 3 29
8: 1003 4 28
9: 1006 1 22
10: 1006 2 23
11: 1006 3 25
12: 1006 4 19
Joining
library(data.table)
setDT(df)[df[, .N, .(ID, Wave)][, .N, ID][N == 4L, .(ID)], on = "ID"]
which returns the same result.
Data
library(data.table)
# Read the sample data and store it as `df`, the name used by the snippets
# above; drop = 1L discards the row-number column `rn`.
df <- fread("
rn ID Wave Score
1 1001 1 28
2 1001 2 27
3 1001 3 28
4 1001 4 26
5 1002 1 30
6 1002 3 30
7 1003 1 30
8 1003 2 30
9 1003 3 29
10 1003 4 28
11 1004 1 22
12 1005 1 20
13 1005 2 18
14 1006 1 22
15 1006 2 23
16 1006 3 25
17 1006 4 19", drop = 1L)

Moving average and moving slope in R

I am looking to separately calculate a 7-day moving average and 7-day moving slope of 'oldvar'.
My sincere apologies that I didn't add the details below in my original post. These are repeated observations for each id which can go from a minimum of 3 observations per id to 100 observations per id. The start day can be different for different IDs, and to make things complicated, the days are not equally spaced, so some IDs have missing days.
Here is the data structure. Please note that 'average' is the variable that I am trying to create as moving 7-day average for each ID:
id day outcome average
1 1 15 100 NA
2 1 16 110 NA
3 1 17 190 NA
4 1 18 130 NA
5 1 19 140 NA
6 1 20 150 NA
7 1 21 160 140
8 1 22 100 140
9 1 23 180 150
10 1 24 120 140
12 2 16 90 NA
13 2 17 110 NA
14 2 18 120 NA
12 2 20 130 NA
15 3 16 110 NA
16 3 18 200 NA
17 3 19 180 NA
18 3 21 170 NA
19 3 22 180 168
20 3 24 210 188
21 3 25 160 180
22 3 27 200 184
Also, would appreciate advice on how to calculate a moving 7-day slope using the same.
Thank you and again many apologies for being unclear the first time around.
The real challenge is to create a data.frame after completing the missing rows. One solution could be using zoo library. The rollapply function will provide a way to assign NA value for the initial rows.
Using data from OP as is, the solution could be:
library(zoo)
library(dplyr)
# Data from OP
df <- structure(list(id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L),
day = c(15L,16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 16L, 17L, 18L, 20L,
16L, 18L, 19L, 21L, 22L, 24L, 25L, 27L),
outcome = c(100L, 110L,190L, 130L, 140L, 150L, 160L, 100L, 180L, 120L, 90L, 110L, 120L,
130L, 110L, 200L, 180L, 170L, 180L, 210L, 160L, 200L)),
.Names = c("id", "day", "outcome"), row.names = c(NA, -22L), class = "data.frame")
# Make a list without missing day for each id
df_complete <- merge(
expand.grid(id=unique(df$id), day=min(df$day):max(df$day)),
df, all=TRUE)
# Valid range of day for each ID group
df_id_wise_range <- df %>% group_by(id) %>%
summarise(min_day = min(day), max_day = max(day)) %>% as.data.frame()
# id min_day max_day
# 1 1 15 24
# 2 2 16 20
# 3 3 16 27
# df_complete already carries `outcome` from the merge with df (NA on the
# inserted days), so no second join against df is needed. Attach each id's
# valid day range from df_id_wise_range and drop the padded days outside it.
df_final <- df_complete %>%
  inner_join(df_id_wise_range, by = "id") %>%
  filter(day >= min_day & day <= max_day) %>%
  select(id, day, outcome) %>%
  as.data.frame()
# Now apply mean to get average
# Per id, take the mean of `outcome` over a right-aligned window of 7 rows
# (one row per day after the completion step above). na.rm = TRUE skips the
# NA outcomes on inserted days; rows without a full 7-row window come out as
# NA (see the result below).
df_average <- df_final %>% group_by(id) %>%
mutate(average= rollapply(outcome, 7, mean, na.rm = TRUE, by = 1,
fill = NA, align = "right", partial = 7)) %>% as.data.frame()
df_average
# The result
# id day outcome average
#1 1 15 100 NA
#2 1 16 110 NA
#3 1 17 190 NA
#4 1 18 130 NA
#5 1 19 140 NA
#6 1 20 150 NA
#7 1 21 160 140.0
#8 1 22 100 140.0
#9 1 23 180 150.0
#10 1 24 120 140.0
#11 2 16 90 NA
#12 2 17 110 NA
#13 2 18 120 NA
#....
#....
#19 3 19 180 NA
#20 3 20 NA NA
#21 3 21 170 NA
#22 3 22 180 168.0
#23 3 23 NA 182.5
#24 3 24 210 188.0
#25 3 25 160 180.0
#26 3 26 NA 180.0
#27 3 27 200 184.0
The steps to calculate moving slope are:
First create a function to return slope
Use the function as part of rollapplyr
#Function to calculate slope
slop_e <- function(z) coef(lm(b ~ a, as.data.frame(z)))[[2]]
#Apply function
z2$slope <- rollapplyr(zoo(z2), 7, slop_e , by.column = FALSE, fill = NA, align = "right")
z2
a b mean_a slope
1 1 21 NA NA
2 2 22 NA NA
3 3 23 NA NA
4 4 24 NA NA
5 5 25 NA NA
6 6 26 NA NA
7 7 27 4 1
8 8 28 5 1
9 9 29 6 1
10 10 30 7 1
11 11 31 8 1
12 12 32 9 1
13 13 33 10 1
14 14 34 11 1
15 15 35 12 1
16 16 36 13 1
17 17 37 14 1
18 18 38 15 1
19 19 39 16 1
20 20 40 17 1

lapply alternative to for loop to append to data frame

I have a data frame:
df<-structure(list(chrom = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 3L, 3L, 4L, 4L, 4L, 4L), .Label = c("1", "2", "3", "4"), class = "factor"),
pos = c(10L, 200L, 134L, 400L, 600L, 1000L, 20L, 33L, 40L,
45L, 50L, 55L, 100L, 123L)), .Names = c("chrom", "pos"), row.names = c(NA, -14L), class = "data.frame")
> head(df)
chrom pos
1 1 10
2 1 200
3 1 134
4 1 400
5 1 600
6 1 1000
And I want to calculate pos[i+1] - pos[i] on the same chromosome (chrom)
By using a for loop over each chrom level, and another over each row I get the expected results:
for (c in levels(df$chrom)){
df_chrom<-filter(df, chrom == c)
df_chrom<-arrange(df_chrom, df_chrom$pos)
for (i in 1:nrow(df_chrom)){
dist<-(df_chrom$pos[i+1] - df_chrom$pos[i])
logdist<-log10(dist)
cat(c, i, df_chrom$pos[i], dist, logdist, "\n")
}
}
However, I want to save this to a data frame, and think that lapply or apply is the right way to go about this. I can't work out how to make the pos[i+1] - pos[i] calculation though (seeing as lapply works on each row/column).
Any pointers would be appreciated
Here's the output from my solution:
chrom index pos dist log10dist
1 1 10 124 2.093422
1 2 134 66 1.819544
1 3 200 200 2.30103
1 4 400 200 2.30103
1 5 600 400 2.60206
1 6 1000 NA NA
2 1 20 13 1.113943
2 2 33 NA NA
3 1 40 5 0.69897
3 2 45 NA NA
4 1 50 5 0.69897
4 2 55 45 1.653213
4 3 100 23 1.361728
4 4 123 NA NA
We could do this using a group by difference. Convert the 'data.frame' to 'data.table' (setDT(df)), grouped by 'chrom', order the 'pos', get the difference of 'pos' (diff) and also log of the difference
library(data.table)
setDT(df)[order(pos), {v1 <- diff(pos)
.(index = seq_len(.N), pos = pos,
dist = c(v1, NA), logdiff = c(log10(v1), NA))}
, by = chrom]
# chrom index pos dist logdiff
# 1: 1 1 10 124 2.093422
# 2: 1 2 134 66 1.819544
# 3: 1 3 200 200 2.301030
# 4: 1 4 400 200 2.301030
# 5: 1 5 600 400 2.602060
# 6: 1 6 1000 NA NA
# 7: 2 1 20 13 1.113943
# 8: 2 2 33 NA NA
# 9: 3 1 40 5 0.698970
#10: 3 2 45 NA NA
#11: 4 1 50 5 0.698970
#12: 4 2 55 45 1.653213
#13: 4 3 100 23 1.361728
#14: 4 4 123 NA NA
Upon running the OP's code the output printed are
#1 1 10 124 2.093422
#1 2 134 66 1.819544
#1 3 200 200 2.30103
#1 4 400 200 2.30103
#1 5 600 400 2.60206
#1 6 1000 NA NA
#2 1 20 13 1.113943
#2 2 33 NA NA
#3 1 40 5 0.69897
#3 2 45 NA NA
#4 1 50 5 0.69897
#4 2 55 45 1.653213
#4 3 100 23 1.361728
#4 4 123 NA NA
We split df by df$chrom (Note that we reorder both df and df$chrom before splitting). Then we go through each of the subgroups (the subgroups are called a in this example) using lapply. On the pos column of each subgroup, we calculate difference (diff) of consecutive elements and take log10. Since diff decreases the number of elements by 1, we add a NA to the end. Finally, we rbind all the subgroups together using do.call.
# Sort by chrom then pos, split into one data frame per chromosome, and for
# each piece append the log10 of the gap to the next position (NA on the last
# row of each chromosome), then stack the pieces back together.
# Note: the new column is named `dist` but holds log10 of the distance, not
# the raw distance.
do.call(rbind, lapply(split(df[order(df$chrom, df$pos),], df$chrom[order(df$chrom, df$pos)]),
function(a) data.frame(a, dist = c(log10(diff(a$pos)), NA))))
# chrom pos dist
#1.1 1 10 2.093422
#1.3 1 134 1.819544
#1.2 1 200 2.301030
#1.4 1 400 2.301030
#1.5 1 600 2.602060
#1.6 1 1000 NA
#2.7 2 20 1.113943
#2.8 2 33 NA
#3.9 3 40 0.698970
#3.10 3 45 NA
#4.11 4 50 0.698970
#4.12 4 55 1.653213
#4.13 4 100 1.361728
#4.14 4 123 NA

how to convert rows into columns without repetition

I have a data like this:
1 233
1 333
1 455
1 345
2 543
2 433
2 344
2 400
3 444
3 111
3 000
3 432
I want to change it to this new dataset like this:
1 2 3
233 543 444
333 433 111
455 344 000
345 400 432
How can I do this in R? Does anybody know a script for this? Please note that my real data is much larger and has a large number of rows.
With x as your data frame with columns V1 and V2, you can add indices counting the elements in each level:
> x$V0 <- ave(x$V1, x$V1, FUN=seq_along)
> x
V1 V2 V0
1 1 233 1
2 1 333 2
3 1 455 3
4 1 345 4
5 2 543 1
6 2 433 2
7 2 344 3
8 2 400 4
9 3 444 1
10 3 111 2
11 3 0 3
12 3 432 4
Now apply reshape:
> reshape(x, direction='wide', timevar='V1', idvar='V0')
V0 V2.1 V2.2 V2.3
1 1 233 543 444
2 2 333 433 111
3 3 455 344 0
4 4 345 400 432
x:
structure(list(V1 = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L,
3L, 3L), V2 = c(233L, 333L, 455L, 345L, 543L, 433L, 344L, 400L,
444L, 111L, 0L, 432L)), .Names = c("V1", "V2"), class = "data.frame", row.names = c(NA,
-12L))

Resources