Hi I have a bunch of hydrological data on streamflow(Q) that I want to standardize. Data is stored an a large nested table with a layout like the one below that I need to keep:
Flowtestlist <- list(list("910" = data.frame( Q=c(650, 720, 550, 580, 800)),
"950" = data.frame( Q=c(550, 770, 520, 540, 790))),
list ("910" = data.frame( Q=c(450, 620, 750, 580, 800)),
"950" = data.frame( Q=c(650, 750, 580, 520, 890))))
I have levels [[1]] and [[2]], in reality, I have 9 of them and those are also model numbers. Within each model I have 18 subbasins numbered 910, 950, 1012, 1087 etc (in the example above just two subbasins 910, 950 for simplicity). The subbasins contain data on streamflow (Q).
There's also a lookup table:
test_model <- c(1,1,2,2)
test_subbasin <- c(910,950,910,950)
Q_mean <- c(870,765,823,689)
FlowtestDF <- data.frame(test_model, test_subbasin, Q_mean)
This data frame includes streamflow means (Q_mean) for the reference period for each model and subbasin. I want to take each Q from the nested table and find the matching model number and subbasin in the lookup table and divide it to get the standardized streamflow Q_st.
fun_st <- function(x, y=FlowtestDF) {
x$Q_st <- x$Q/y$Q_mean
x <- x
}
testresult <- lapply(Flowtestlist, lapply, fun_st)
It doesn't work. As I understand the function can't find the appropriate location of the needed number in the lookup table (model and subbasin). How can I make this work, while keeping the nested table structure of the data?
Are you looking for this?
Map(\(x, y) lapply(y[match(x$test_subbasin, names(y))], \(i) i / x$Q_mean),
split(FlowtestDF, FlowtestDF$test_model),
Flowtestlist)
# $`1`
# $`1`$`910`
# Q
# 1 0.7471264
# 2 0.9411765
# 3 0.6321839
# 4 0.7581699
# 5 0.9195402
#
# $`1`$`950`
# Q
# 1 0.6321839
# 2 1.0065359
# 3 0.5977011
# 4 0.7058824
# 5 0.9080460
#
#
# $`2`
# $`2`$`910`
# Q
# 1 0.5467801
# 2 0.8998549
# 3 0.9113001
# 4 0.8417997
# 5 0.9720535
#
# $`2`$`950`
# Q
# 1 0.7897934
# 2 1.0885341
# 3 0.7047388
# 4 0.7547170
# 5 1.0814095
Note: If you're (still) using R<4.1, instead of e.g. \(x, y) use function(x, y).
It is easier to do the processing if you have data in a flat dataframe. If for some reason you have to keep the dataframe in nested structure you may split it again.
library(dplyr)
library(purrr)
map_df(Flowtestlist, ~bind_rows(., .id = 'test_subbasin'), .id = 'test_model') %>%
type.convert(as.is = TRUE) %>%
left_join(FlowtestDF, by = c('test_subbasin', 'test_model')) %>%
mutate(Q_st = Q/Q_mean) %>%
split(.$test_model) %>%
map(~.x %>% select(Q, Q_st) %>% split(.x$test_subbasin))
#$`1`
#$`1`$`910`
# Q Q_st
#1 650 0.7471264
#2 720 0.8275862
#3 550 0.6321839
#4 580 0.6666667
#5 800 0.9195402
#$`1`$`950`
# Q Q_st
#6 550 0.7189542
#7 770 1.0065359
#8 520 0.6797386
#9 540 0.7058824
#10 790 1.0326797
#$`2`
#$`2`$`910`
# Q Q_st
#11 450 0.5467801
#12 620 0.7533414
#13 750 0.9113001
#14 580 0.7047388
#15 800 0.9720535
#$`2`$`950`
# Q Q_st
#16 650 0.9433962
#17 750 1.0885341
#18 580 0.8417997
#19 520 0.7547170
#20 890 1.2917271
library(tidyr)
extr <- function(x){
a <- data.frame(x)
names(a) <- names(x)
a$test_model <- parent.frame()$i
a <- pivot_longer(a,setdiff(names(a),'test_model'),names_to = 'test_subbasin',values_to = 'Q')
a
}
to_df <- lapply(Flowtestlist,extr)
df <- do.call(rbind,to_df)
with_lookup <- merge(df,FlowtestDF,by =c('test_model','test_subbasin'))
with_lookup$Q_st <- with_lookup$Q/with_lookup$Q_mean
with_lookup
output;
test_model test_subbasin Q Q_mean Q_st
<int> <chr> <dbl> <dbl> <dbl>
1 1 910 650 870 0.747
2 1 910 720 870 0.828
3 1 910 550 870 0.632
4 1 910 580 870 0.667
5 1 910 800 870 0.920
6 1 950 550 765 0.719
7 1 950 770 765 1.01
8 1 950 520 765 0.680
9 1 950 540 765 0.706
10 1 950 790 765 1.03
11 2 910 450 823 0.547
12 2 910 620 823 0.753
13 2 910 750 823 0.911
14 2 910 580 823 0.705
15 2 910 800 823 0.972
16 2 950 650 689 0.943
17 2 950 750 689 1.09
18 2 950 580 689 0.842
19 2 950 520 689 0.755
20 2 950 890 689 1.29
The following will derive the required output ...
df <- data.frame(test_subbasin = unlist(Flowtestlist), ref = names(unlist(Flowtestlist)))
df$Q_st <- df$test_subbasin / FlowtestDF$Q_mean[match(gsub("\\..*", "", df$ref), FlowtestDF$test_subbasin)]
df
# test_subbasin ref Q_st
# 1 650 910.Q1 0.7471264
# 2 720 910.Q2 0.8275862
# 3 550 910.Q3 0.6321839
# 4 580 910.Q4 0.6666667
# 5 800 910.Q5 0.9195402
# 6 550 950.Q1 0.7189542
# 7 770 950.Q2 1.0065359
# 8 520 950.Q3 0.6797386
# 9 540 950.Q4 0.7058824
# 10 790 950.Q5 1.0326797
# 11 450 910.Q1 0.5172414
# 12 620 910.Q2 0.7126437
# 13 750 910.Q3 0.8620690
# 14 580 910.Q4 0.6666667
# 15 800 910.Q5 0.9195402
# 16 650 950.Q1 0.8496732
# 17 750 950.Q2 0.9803922
# 18 580 950.Q3 0.7581699
# 19 520 950.Q4 0.6797386
# 20 890 950.Q5 1.1633987
Related
I used this method to gather mean and sd result successly before here .And then, I tried to use this methond to gather my gene counts DEG data with "logFC","cil","cir","ajustP_value" .But I failed because something wrong with my result.
Just like this:
data_1<-data.frame(matrix(sample(1:1200,1200,replace = T),48,25))
names(data_1) <- c(paste0("Gene_", 1:25))
rownames(data_1)<-NULL
head(data_1)
A<-paste0(1:48,"_logFC")
data_logFC<-data.frame(A=A,data_1)
#
data_2<-data.frame(matrix(sample(1:1200,1200,replace = T),48,25))
names(data_2) <- c(paste0("Gene_", 1:25))
rownames(data_1)<-NULL
B_L<-paste0(1:48,"_CI.L")
data_CIL<-data.frame(A=B_L,data_2)
data_CIL[1:48,1:6]
#
data_3<-data.frame(matrix(sample(1:1200,1200,replace = T),48,25))
names(data_3) <- c(paste0("Gene_", 1:25))
rownames(data_3)<-NULL
C_R<-paste0(1:48,"_CI.R")
data_CIR<-data.frame(A=C_R,data_3)
data_CIR[1:48,1:6]
#
data_4<-data.frame(matrix(sample(1:1200,1200,replace = T),48,25))
names(data_4) <- c(paste0("Gene_", 1:25))
rownames(data_4)<-NULL
D<-paste0(1:48,"_adj.P.Val")
data_ajustP<-data.frame(A=D,data_4)
data_ajustP[1:48,1:6]
# combine data_logFC data_CIL data_CIR data_ajustP
data <- bind_rows(list(
logFC = data_logFC,
CIL = data_CIL,
CIR =data_CIR,
AJSTP=data_ajustP
), .id = "stat")
data[1:10,1:6]
data_DEG<- data %>%
pivot_longer(-c(stat,A), names_to = "Gene", values_to = "value") %>%pivot_wider(names_from = "stat", values_from = "value")
head(data_DEG,100)
str(data_DEG$CIL)
> head(data_DEG,100)
# A tibble: 100 x 6
A Gene logFC CIL CIR AJSTP
<chr> <chr> <int> <int> <int> <int>
1 1_logFC Gene_1 504 NA NA NA
2 1_logFC Gene_2 100 NA NA NA
3 1_logFC Gene_3 689 NA NA NA
4 1_logFC Gene_4 779 NA NA NA
5 1_logFC Gene_5 397 NA NA NA
6 1_logFC Gene_6 1152 NA NA NA
7 1_logFC Gene_7 780 NA NA NA
8 1_logFC Gene_8 155 NA NA NA
9 1_logFC Gene_9 142 NA NA NA
10 1_logFC Gene_10 1150 NA NA NA
# … with 90 more rows
Why is there so many NAs ?
Can somebody help me ? Vary thankful.
EDITE:
I confused the real sample group of my data. So I reshape my data without a right index.
Here is my right method:
data[1:10,1:6]
data<-separate(data,A,c("Name","stat2"),"_")
data<-data[,-3]
data_DEG<- data %>%
pivot_longer(-c(stat,Name), names_to = "Gene", values_to = "value") %>%pivot_wider(names_from = "stat", values_from = "value")
head(data_DEG,10)
tail(data_DEG,10)
> head(data_DEG,10)
# A tibble: 10 x 6
Name Gene logFC CIL CIR AJSTP
<chr> <chr> <int> <int> <int> <int>
1 1 Gene_1 504 1116 774 278
2 1 Gene_2 100 936 448 887
3 1 Gene_3 689 189 718 933
4 1 Gene_4 779 943 690 19
5 1 Gene_5 397 976 40 135
6 1 Gene_6 1152 304 343 647
7 1 Gene_7 780 1076 796 1024
8 1 Gene_8 155 645 469 180
9 1 Gene_9 142 256 889 1047
10 1 Gene_10 1150 976 1194 670
> tail(data_DEG,10)
# A tibble: 10 x 6
Name Gene logFC CIL CIR AJSTP
<chr> <chr> <int> <int> <int> <int>
1 48 Gene_16 448 633 1080 1122
2 48 Gene_17 73 772 14 388
3 48 Gene_18 652 999 699 912
4 48 Gene_19 600 1163 512 241
5 48 Gene_20 428 1119 1142 348
6 48 Gene_21 66 553 240 82
7 48 Gene_22 753 1119 630 117
8 48 Gene_23 1017 305 1120 447
9 48 Gene_24 432 1175 447 670
10 48 Gene_25 482 394 371 696
It's a perfect result!!
This question already has answers here:
Reshape horizontal to to long format using pivot_longer
(3 answers)
Closed 2 years ago.
Thank you all for your answers, I thought I was smarter than I am and hoped I would've understood any of it. I think I messed up my visualisation of my data aswell. I have edited my post to better show my sample data. Sorry for the inconvenience, and I truly hope that someone can help me.
I have a question about reshaping my data. The data collected looks as such:
data <- read.table(header=T, text='
pid measurement1 Tdays1 measurement2 Tdays2 measurement3 Tdays3 measurment4 Tdays4
1 1356 1435 1483 1405 1563 1374 NA NA
2 943 1848 1173 1818 1300 1785 NA NA
3 1590 185 NA NA NA NA 1585 294
4 130 72 443 70 NA NA 136 79
4 140 82 NA NA NA NA 756 89
4 220 126 266 124 NA NA 703 128
4 166 159 213 156 476 145 776 166
4 380 189 583 173 NA NA 586 203
4 353 231 510 222 656 217 526 240
4 180 268 NA NA NA NA NA NA
4 NA NA NA NA NA NA 580 278
4 571 334 596 303 816 289 483 371
')
Now i would like it to look something like this:
PID Time Value
1 1435 1356
1 1405 1483
1 1374 1563
2 1848 943
2 1818 1173
2 1785 1300
3 185 1590
... ... ...
How would i tend to get there? I have looked up some things about wide to longformat, but it doesn't seem to do the trick. Am reletively new to Rstudio and Stackoverflow (if you couldn't tell that already).
Kind regards, and thank you in advance.
Here is a slightly different pivot_longer() version.
library(tidyr)
library(dplyr)
dw %>%
pivot_longer(cols = -PID, names_to =".value", names_pattern = "(.+)[0-9]")
# A tibble: 9 x 3
PID T measurement
<dbl> <dbl> <dbl>
1 1 1 100
2 1 4 200
3 1 7 50
4 2 2 150
5 2 5 300
6 2 8 60
7 3 3 120
8 3 6 210
9 3 9 70
The names_to = ".value" argument creates new columns from column names based on the names_pattern argument. The names_pattern argument takes a special regex input. In this case, here is the breakdown:
(.+) # match everything - anything noted like this becomes the ".values"
[0-9] # numeric characters - tells the pattern that the numbers
# at the end are excluded from ".values". If you have multiple digit
# numbers, use [0-9*]
In the last edit you asked for a solution that is easy to understand. A very simple approach would be to stack the measurement columns on top of each other and the Tdays columns on top of each other. Although specialty packages make things very concise and elegant, for simplicity we can solve this without additional packages. Standard R has a convenient function aptly named stack, which works like this:
> exp <- data.frame(value1 = 1:5, value2 = 6:10)
> stack(exp)
values ind
1 1 value1
2 2 value1
3 3 value1
4 4 value1
5 5 value1
6 6 value2
7 7 value2
8 8 value2
9 9 value2
10 10 value2
We can stack measurements and Tdays seperately and then combine them via cbind:
data <- read.table(header=T, text='
pid measurement1 Tdays1 measurement2 Tdays2 measurement3 Tdays3 measurement4 Tdays4
1 1356 1435 1483 1405 1563 1374 NA NA
2 943 1848 1173 1818 1300 1785 NA NA
3 1590 185 NA NA NA NA 1585 294
4 130 72 443 70 NA NA 136 79
4 140 82 NA NA NA NA 756 89
4 220 126 266 124 NA NA 703 128
4 166 159 213 156 476 145 776 166
4 380 189 583 173 NA NA 586 203
4 353 231 510 222 656 217 526 240
4 180 268 NA NA NA NA NA NA
4 NA NA NA NA NA NA 580 278
4 571 334 596 303 816 289 483 371
')
cbind(stack(data, c(measurement1, measurement2, measurement3, measurement4)),
stack(data, c(Tdays1, Tdays2, Tdays3, Tdays4)))
Which keeps measurements and Tdays neatly together but leaves us without pid which we can add using rep to replicate the original pid 4 times:
result <- cbind(pid = rep(data$pid, 4),
stack(data, c(measurement1, measurement2, measurement3, measurement4)),
stack(data, c(Tdays1, Tdays2, Tdays3, Tdays4)))
The head of which looks like
> head(result)
pid values ind values ind
1 1 1356 measurement1 1435 Tdays1
2 2 943 measurement1 1848 Tdays1
3 3 1590 measurement1 185 Tdays1
4 4 130 measurement1 72 Tdays1
5 4 140 measurement1 82 Tdays1
6 4 220 measurement1 126 Tdays1
As I said above, this is not the order you expected and you can try to sort this data.frame, if that is of any concern:
result <- result[order(result$pid), c(1, 4, 2)]
names(result) <- c("pid", "Time", "Value")
leading to the final result
> head(result)
pid Time Value
1 1 1435 1356
13 1 1405 1483
25 1 1374 1563
37 1 NA NA
2 2 1848 943
14 2 1818 1173
tidyverse solution
library(tidyverse)
dw %>%
pivot_longer(-PID) %>%
mutate(name = gsub('^([A-Za-z]+)(\\d+)$', '\\1_\\2', name )) %>%
separate(name, into = c('A', 'B'), sep = '_', convert = T) %>%
pivot_wider(names_from = A, values_from = value)
Gives the following output
# A tibble: 9 x 4
PID B T measurement
<int> <int> <int> <int>
1 1 1 1 100
2 1 2 4 200
3 1 3 7 50
4 2 1 2 150
5 2 2 5 300
6 2 3 8 60
7 3 1 3 120
8 3 2 6 210
9 3 3 9 70
Considering a dataframe, df like the following:
PID T1 measurement1 T2 measurement2 T3 measurement3
1 1 100 4 200 7 50
2 2 150 5 300 8 60
3 3 120 6 210 9 70
You can use this solution to get your required dataframe:
iters = seq(from = 4, to = length(colnames(df))-1, by = 2)
finalDf = df[, c(1,2,3)]
for(j in iters){
tobind = df[, c(1,j,j+1)]
finalDf = rbind(finalDf, tobind)
}
finalDf = finalDf[order(finalDf[,1]),]
print(finalDf)
The output of the print statement is this:
PID T1 measurement1
1 1 1 100
4 1 4 200
7 1 7 50
2 2 2 150
5 2 5 300
8 2 8 60
3 3 3 120
6 3 6 210
9 3 9 70
Maybe you can try reshape like below
reshape(
setNames(data, gsub("(\\d+)$", "\\.\\1", names(data))),
direction = "long",
varying = 2:ncol(data)
)
I have a dataframe that looks like this:
(Example edited)
df <- data.frame(Subject = c(rep("A", 9), rep("B", 8)),
Trial = c(1,1,2,3,4,4,5,6,6,1,2,2,3,4,5,5,6),
Feature_1 = c(rep(123, 2), 234, 345, rep(456, 2), 567, rep(678, 2), 831, rep(444, 2), 461, 921, rep(436, 2), 111),
Feature_2 = c(rep(321, 2), 543, 654, rep(765, 2), 876, rep(987, 2), 912, rep(302, 2), 900, 555, rep(382, 2), 197),
Feature_3 = c(rep(190, 2), 459, 392, rep(398, 2), 492, rep(587, 2), 761, rep(901, 2), 783, 312, rep(880, 2), 229),
Feature_correct = NA)
df
Subject Trial Feature_1 Feature_2 Feature_3 Feature_correct
1 A 1 123 321 190 NA
2 A 1 123 321 190 NA
3 A 2 234 543 459 NA
4 A 3 345 654 392 NA
5 A 4 456 765 398 NA
6 A 4 456 765 398 NA
7 A 5 567 876 492 NA
8 A 6 678 987 587 NA
9 A 6 678 987 587 NA
10 B 1 831 912 761 NA
11 B 2 444 302 901 NA
12 B 2 444 302 901 NA
13 B 3 461 900 783 NA
14 B 4 921 555 312 NA
15 B 5 436 382 880 NA
16 B 5 436 382 880 NA
17 B 6 111 197 229 NA
What I need is for the Feature_correct column to contain the values from Feature_n depending on Trial for each Subject. So:
Subject A & Trials 1 and 2: Feature_correct contains the values for Subject A & Trials 1 and 2 under Feature_1 (respectively).
Subject A & Trials 3 and 4: Feature_correct contains the values for Subject A & Trials 3 and 4 under Feature_2 (respectively).
Subject A & Trials 5 and 6: Feature_correct contains the values for Subject A & Trials 5 and 6 under Feature_3 (respectively).
and so on for Subject B.
This is my goal:
df$Feature_goal <- c(rep(123, 2), 234, 654, rep(765, 2), 492, rep(587, 2), 831, rep(444, 2), 900, 555, rep(880, 2), 229)
head(df)
Subject Trial Feature_1 Feature_2 Feature_3 Feature_correct Feature_goal
1 A 1 123 321 190 NA 123
2 A 1 123 321 190 NA 123
3 A 2 234 543 459 NA 234
4 A 3 345 654 392 NA 654
5 A 4 456 765 398 NA 765
6 A 4 456 765 398 NA 765
I know how to do this manually (specifying the Subject name and Trial number in the syntax), but I'd like to create a loop (or whatever else works) so that I don't have to type the name of each Subject (in my real dataset I have many participants and many "Feature" variables).
I've tried this for loop, but I get an error:
df <- for(i in 1:nrow(df$Subject)) {
if(df$Trial %in% c(1,2)){
df[df$Subject == i $ df$Trial %in% c(1,2),]$Feature_correct = df[df$Subject == i & df$Trial %in% c(1,2),]$Feature_1
}
if(df$Trial %in% c(3,4)){
df[df$Subject == i $ df$Trial %in% c(3,4),]$Feature_correct = df[df$Subject == i & df$Trial %in% c(3,4),]$Feature_2
}
if(df$Trial %in% c(5,6)){
df[df$Subject == i $ df$Trial %in% c(5,6),]$Feature_correct = df[df$Subject == i & df$Trial %in% c(5,6),]$Feature_3
}
}
> Error in 1:nrow(df$Subject) : argument of length 0
Indeed,
nrow(df$Subject)
> NULL
Would anyone know how to make this work (with the loop or in any other way)?
A vectorized way would be to create a row/column index by pasting "Feature" with Trial number to match it with column names and subset values from original dataframe.
df$Feature_Goal <- df[cbind(seq_len(nrow(df)),
match(paste0("Feature_", df$Trial), names(df)))]
df
# Subject Trial Feature_1 Feature_2 Feature_3 Feature_correct Feature_Goal
#1 A 1 123 321 190 NA 123
#2 A 1 123 321 190 NA 123
#3 A 2 234 543 459 NA 543
#4 A 2 234 543 459 NA 543
#5 A 3 345 654 392 NA 392
#6 A 3 345 654 392 NA 392
#7 B 1 456 765 398 NA 456
#8 B 1 456 765 398 NA 456
#9 B 2 567 876 492 NA 876
#10 B 2 567 876 492 NA 876
#11 B 3 678 987 587 NA 587
#12 B 3 678 987 587 NA 587
Here is a solution using a loop.
for (i in 1:3) {
idx <- which(df$Trial == i)
df[idx,6] <- df[idx,i+2]
}
I have a data frame like this:
df <- data.frame(x=c(7,5,4),y=c(100,100,100),w=c(170,170,170),z=c(132,720,1256))
I create a new column using mapply:
set.seed(123)
library(truncnorm)
df$res <- mapply(rtruncnorm,df$x,df$y,df$w,df$z,25)
So, I got:
> df
#x y w z res
#1 7 100 170 132 117.9881, 126.2456, 133.7627, 135.2322, 143.5229, 100.3735, 114.8287
#2 5 100 170 720 168.8581, 169.4955, 169.6461, 169.8998, 169.0343
#3 4 100 170 1256 169.7245, 167.6744, 169.7025, 169.4441
#dput(df)
df <- structure(list(x = c(7, 5, 4), y = c(100, 100, 100), w = c(170,
170, 170), z = c(132, 720, 1256), res = list(c(117.988108836195,
126.245562762918, 133.762709785614, 135.232193379024, 143.52290514973,
100.373469134837, 114.828678702662), c(168.858147661715, 169.495493758985,
169.646123183828, 169.899849943838, 169.034333943479), c(169.724470294466,
167.674371713068, 169.70250974042, 169.444134892323))), .Names = c("x",
"y", "w", "z", "res"), row.names = c(NA, -3L), class = "data.frame")
But what I really need is repeat each row of df dataframe according to the df$res result as follows:
> df2
# x y w z res
#1 7 100 170 132 117.9881
#2 7 100 170 132 126.2456
#3 7 100 170 132 133.7627
#4 7 100 170 132 135.2322
#5 7 100 170 132 143.5229
#6 7 100 170 132 100.3735
#7 7 100 170 132 114.8287
#8 5 100 170 720 168.8581
#9 5 100 170 720 169.4955
#10 5 100 170 720 169.6461
#11 5 100 170 720 169.8998
#12 5 100 170 720 169.0343
#13 4 100 170 1256 169.7245
#14 4 100 170 1256 167.6744
#15 4 100 170 1256 169.7025
#16 4 100 170 1256 169.4441
How, do I achieve this efficiently? I need to apply this to a big dataframe
df <- data.frame(x=c(7,5,4),y=c(100,100,100),w=c(170,170,170),z=c(132,720,1256))
set.seed(123)
l <- mapply(rtruncnorm,df$x,df$y,df$w,df$z,25)
cbind.data.frame(df[rep(seq_along(l), lengths(l)),],
res = unlist(l))
# x y w z res
# 1 7 100 170 132 117.9881
# 1.1 7 100 170 132 126.2456
# 1.2 7 100 170 132 133.7627
# 1.3 7 100 170 132 135.2322
# 1.4 7 100 170 132 143.5229
# 1.5 7 100 170 132 100.3735
# 1.6 7 100 170 132 114.8287
# 2 5 100 170 720 168.8581
# 2.1 5 100 170 720 169.4955
# 2.2 5 100 170 720 169.6461
# 2.3 5 100 170 720 169.8998
# 2.4 5 100 170 720 169.0343
# 3 4 100 170 1256 169.7245
# 3.1 4 100 170 1256 167.6744
# 3.2 4 100 170 1256 169.7025
# 3.3 4 100 170 1256 169.4441
Try this based on your given df:
df$res <- sapply(df$res, paste0, collapse=",")
do.call(rbind, apply(df, 1, function(x) do.call(expand.grid, strsplit(x, ","))))
# x y w z res
# 1 7 100 170 132 117.988108836195
# 2 7 100 170 132 126.245562762918
# 3 7 100 170 132 133.762709785614
# 4 7 100 170 132 135.232193379024
# 5 7 100 170 132 143.52290514973
# 6 7 100 170 132 100.373469134837
# 7 7 100 170 132 114.828678702662
# 8 5 100 170 720 168.858147661715
# 9 5 100 170 720 169.495493758985
# 10 5 100 170 720 169.646123183828
# 11 5 100 170 720 169.899849943838
# 12 5 100 170 720 169.034333943479
# 13 4 100 170 1256 169.724470294466
# 14 4 100 170 1256 167.674371713068
# 15 4 100 170 1256 169.70250974042
# 16 4 100 170 1256 169.444134892323
Task:
I have to check that if the value in the data vector is above from the given threshold,
If in my data vector, I found 5 consecutive values greater then the given threshold then I keep these values as they are.
If I have less then 5 values (not 5 consecutive values) then I will replace these values with NA's.
The sample data and required output is shown below. In this example the threshold value is 1000. X is input data variable and the desired output is: Y = X(Threshold > 1000)
X Y
580 580
457 457
980 980
1250 NA
3600 NA
598 598
1200 1200
1345 1345
9658 9658
1253 1253
4500 4500
1150 1150
596 596
594 594
550 550
1450 NA
320 320
1780 NA
592 592
590 590
I have used the following code in R for my desired output but unable to get the appropriate one:
for (i in 1:nrow(X)) # X is my data vector
{counter=0
if (X[i]>10000)
{
for (j in i:(i+4))
{
if (X[j]>10000)
{counter=counter+1}
}
ifelse (counter < 5, NA, X[j])
}
X[i]<- NA
}
X
I am sure that the above code contain some error. I need help in the form of either a new code or modifying this code or any package in R.
Here is an approach using dplyr, using a cumulative sum of diff(x > 1000) to group the values.
library(dplyr)
df <- data.frame(x)
df
# x
# 1 580
# 2 457
# 3 980
# 4 1250
# 5 3600
# 6 598
# 7 1200
# 8 1345
# 9 9658
# 10 1253
# 11 4500
# 12 1150
# 13 596
# 14 594
# 15 550
# 16 1450
# 17 320
# 18 1780
# 19 592
# 20 590
df %>% mutate(group = cumsum(c(0, abs(diff(x>1000))))) %>%
group_by(group) %>%
mutate(count = n()) %>%
ungroup() %>%
mutate(y = ifelse(x<1000 | count > 5, x, NA))
# x group count y
# (int) (dbl) (int) (int)
# 1 580 0 3 580
# 2 457 0 3 457
# 3 980 0 3 980
# 4 1250 1 2 NA
# 5 3600 1 2 NA
# 6 598 2 1 598
# 7 1200 3 6 1200
# 8 1345 3 6 1345
# 9 9658 3 6 9658
# 10 1253 3 6 1253
# 11 4500 3 6 4500
# 12 1150 3 6 1150
# 13 596 4 3 596
# 14 594 4 3 594
# 15 550 4 3 550
# 16 1450 5 1 NA
# 17 320 6 1 320
# 18 1780 7 1 NA
# 19 592 8 2 592
# 20 590 8 2 590
Another approach :
Y<-rep(NA,nrow(X))
for (i in 1:nrow(X)) {
if (X[i,1]<1000) {Y[i]<-X[i,1]} else if (sum(X[i:min((i+4),nrow(X)),1]>1000)>=5) {
Y[i:min((i+4),nrow(X))]<-X[i:min((i+4),nrow(X)),1]}
}
returns
> Y
[1] 580 457 980 NA NA 598 1200 1345 9658 1253 4500 1150 596 594 550 NA 320 NA 592 590
This assumes that the values of X are in the first column of a dataframe named X.
It then creates Y with NAand only change the values if the criteria are met.