I have a data frame like this:
df <- data.frame(x=c(7,5,4),y=c(100,100,100),w=c(170,170,170),z=c(132,720,1256))
I create a new column using mapply:
set.seed(123)
library(truncnorm)
df$res <- mapply(rtruncnorm,df$x,df$y,df$w,df$z,25)
So, I got:
> df
#x y w z res
#1 7 100 170 132 117.9881, 126.2456, 133.7627, 135.2322, 143.5229, 100.3735, 114.8287
#2 5 100 170 720 168.8581, 169.4955, 169.6461, 169.8998, 169.0343
#3 4 100 170 1256 169.7245, 167.6744, 169.7025, 169.4441
#dput(df)
df <- structure(list(x = c(7, 5, 4), y = c(100, 100, 100), w = c(170,
170, 170), z = c(132, 720, 1256), res = list(c(117.988108836195,
126.245562762918, 133.762709785614, 135.232193379024, 143.52290514973,
100.373469134837, 114.828678702662), c(168.858147661715, 169.495493758985,
169.646123183828, 169.899849943838, 169.034333943479), c(169.724470294466,
167.674371713068, 169.70250974042, 169.444134892323))), .Names = c("x",
"y", "w", "z", "res"), row.names = c(NA, -3L), class = "data.frame")
But what I really need is repeat each row of df dataframe according to the df$res result as follows:
> df2
# x y w z res
#1 7 100 170 132 117.9881
#2 7 100 170 132 126.2456
#3 7 100 170 132 133.7627
#4 7 100 170 132 135.2322
#5 7 100 170 132 143.5229
#6 7 100 170 132 100.3735
#7 7 100 170 132 114.8287
#8 5 100 170 720 168.8581
#9 5 100 170 720 169.4955
#10 5 100 170 720 169.6461
#11 5 100 170 720 169.8998
#12 5 100 170 720 169.0343
#13 4 100 170 1256 169.7245
#14 4 100 170 1256 167.6744
#15 4 100 170 1256 169.7025
#16 4 100 170 1256 169.4441
How, do I achieve this efficiently? I need to apply this to a big dataframe
df <- data.frame(x=c(7,5,4),y=c(100,100,100),w=c(170,170,170),z=c(132,720,1256))
set.seed(123)
l <- mapply(rtruncnorm,df$x,df$y,df$w,df$z,25)
cbind.data.frame(df[rep(seq_along(l), lengths(l)),],
res = unlist(l))
# x y w z res
# 1 7 100 170 132 117.9881
# 1.1 7 100 170 132 126.2456
# 1.2 7 100 170 132 133.7627
# 1.3 7 100 170 132 135.2322
# 1.4 7 100 170 132 143.5229
# 1.5 7 100 170 132 100.3735
# 1.6 7 100 170 132 114.8287
# 2 5 100 170 720 168.8581
# 2.1 5 100 170 720 169.4955
# 2.2 5 100 170 720 169.6461
# 2.3 5 100 170 720 169.8998
# 2.4 5 100 170 720 169.0343
# 3 4 100 170 1256 169.7245
# 3.1 4 100 170 1256 167.6744
# 3.2 4 100 170 1256 169.7025
# 3.3 4 100 170 1256 169.4441
Try this based on your given df:
df$res <- sapply(df$res, paste0, collapse=",")
do.call(rbind, apply(df, 1, function(x) do.call(expand.grid, strsplit(x, ","))))
# x y w z res
# 1 7 100 170 132 117.988108836195
# 2 7 100 170 132 126.245562762918
# 3 7 100 170 132 133.762709785614
# 4 7 100 170 132 135.232193379024
# 5 7 100 170 132 143.52290514973
# 6 7 100 170 132 100.373469134837
# 7 7 100 170 132 114.828678702662
# 8 5 100 170 720 168.858147661715
# 9 5 100 170 720 169.495493758985
# 10 5 100 170 720 169.646123183828
# 11 5 100 170 720 169.899849943838
# 12 5 100 170 720 169.034333943479
# 13 4 100 170 1256 169.724470294466
# 14 4 100 170 1256 167.674371713068
# 15 4 100 170 1256 169.70250974042
# 16 4 100 170 1256 169.444134892323
Related
I have a square matrix that is like:
A <- c("111","111","111","112","112","113")
B <- c(100,10,20,NA,NA,10)
C <- c(10,20,40,NA,10,20)
D <- c(10,20,NA,NA,40,200)
E <- c(20,20,40,10,10,20)
F <- c(NA,NA,40,100,10,20)
G <- c(10,20,NA,30,10,20)
df <- data.frame(A,B,C,D,E,F,G)
names(df) <- c("Codes","111","111","111","112","112","113")
# Codes 111 111 111 112 112 113
# 1 111 100 10 10 20 NA 10
# 2 111 10 20 20 20 NA 20
# 3 111 20 40 NA 40 40 NA
# 4 112 NA NA NA 10 100 30
# 5 112 NA 10 40 10 10 10
# 6 113 10 20 200 20 20 20
I want to reduce it so that observations with the same row and column names are summed up.
So I want to end up with:
# Codes 111 112 113
# 1 111 230 120 30
# 2 112 50 130 40
# 3 113 230 40 20
I tried to first combine the rows with the same "Codes" number, but I was having a lot of trouble.
In tidyverse
library(tidyverse)
df %>%
pivot_longer(-Codes, values_drop_na = TRUE) %>%
group_by(Codes, name) %>%
summarise(value = sum(value), .groups = 'drop')%>%
pivot_wider()
# A tibble: 3 x 4
Codes `111` `112` `113`
<chr> <dbl> <dbl> <dbl>
1 111 230 120 30
2 112 50 130 40
3 113 230 40 20
One way in base R:
tapply(unlist(df[-1]), list(names(df)[-1][col(df[-1])], df[,1][row(df[-1])]), sum, na.rm = TRUE)
111 112 113
111 230 50 230
112 120 130 40
113 30 40 20
Note that this can be simplified as denoted by #thelatemail to
grp <- expand.grid(df$Codes, names(df)[-1])
tapply(unlist(df[-1]), grp, FUN=sum, na.rm=TRUE)
You can also use `xtabs:
xtabs(vals~., na.omit(cbind(grp, vals = unlist(df[-1]))))
Var2
Var1 111 112 113
111 230 120 30
112 50 130 40
113 230 40 20
When dealing with actual matrices - especially with large ones -, expressing the operation as (sparse) linear algebra should be most efficient.
library(Matrix) ## for sparse matrix operations
idx <- c("111","111","111","112","112","113")
mat <- matrix(c(100,10,20,NA,NA,10,
10,20,40,NA,10,20,
10,20,NA,NA,40,200,
20,20,40,10,10,20,
NA,NA,40,100,10,20,
10,20,NA,30,10,20),
nrow=length(idx),
byrow=TRUE, dimnames=list(idx, idx))
## convert NA's to zero
mat[is.na(mat)] <- 0
## examine matrix
mat
## 111 111 111 112 112 113
## 111 100 10 20 0 0 10
## 111 10 20 40 0 10 20
## 111 10 20 0 0 40 200
## 112 20 20 40 10 10 20
## 112 0 0 40 100 10 20
## 113 10 20 0 30 10 20
## indicator matrix
## converts between "code" and "idx" spaces
M_code_idx <- fac2sparse(idx)
## project to "code_code" space
M_code_idx %*% mat %*% t(M_code_idx)
## 3 x 3 Matrix of class "dgeMatrix"
## 111 112 113
## 111 230 50 230
## 112 120 130 40
## 113 30 40 20
Hi I have a bunch of hydrological data on streamflow(Q) that I want to standardize. Data is stored an a large nested table with a layout like the one below that I need to keep:
Flowtestlist <- list(list("910" = data.frame( Q=c(650, 720, 550, 580, 800)),
"950" = data.frame( Q=c(550, 770, 520, 540, 790))),
list ("910" = data.frame( Q=c(450, 620, 750, 580, 800)),
"950" = data.frame( Q=c(650, 750, 580, 520, 890))))
I have levels [[1]] and [[2]], in reality, I have 9 of them and those are also model numbers. Within each model I have 18 subbasins numbered 910, 950, 1012, 1087 etc (in the example above just two subbasins 910, 950 for simplicity). The subbasins contain data on streamflow (Q).
There's also a lookup table:
test_model <- c(1,1,2,2)
test_subbasin <- c(910,950,910,950)
Q_mean <- c(870,765,823,689)
FlowtestDF <- data.frame(test_model, test_subbasin, Q_mean)
This data frame includes streamflow means (Q_mean) for the reference period for each model and subbasin. I want to take each Q from the nested table and find the matching model number and subbasin in the lookup table and divide it to get the standardized streamflow Q_st.
fun_st <- function(x, y=FlowtestDF) {
x$Q_st <- x$Q/y$Q_mean
x <- x
}
testresult <- lapply(Flowtestlist, lapply, fun_st)
It doesn't work. As I understand the function can't find the appropriate location of the needed number in the lookup table (model and subbasin). How can I make this work, while keeping the nested table structure of the data?
Are you looking for this?
Map(\(x, y) lapply(y[match(x$test_subbasin, names(y))], \(i) i / x$Q_mean),
split(FlowtestDF, FlowtestDF$test_model),
Flowtestlist)
# $`1`
# $`1`$`910`
# Q
# 1 0.7471264
# 2 0.9411765
# 3 0.6321839
# 4 0.7581699
# 5 0.9195402
#
# $`1`$`950`
# Q
# 1 0.6321839
# 2 1.0065359
# 3 0.5977011
# 4 0.7058824
# 5 0.9080460
#
#
# $`2`
# $`2`$`910`
# Q
# 1 0.5467801
# 2 0.8998549
# 3 0.9113001
# 4 0.8417997
# 5 0.9720535
#
# $`2`$`950`
# Q
# 1 0.7897934
# 2 1.0885341
# 3 0.7047388
# 4 0.7547170
# 5 1.0814095
Note: If you're (still) using R<4.1, instead of e.g. \(x, y) use function(x, y).
It is easier to do the processing if you have data in a flat dataframe. If for some reason you have to keep the dataframe in nested structure you may split it again.
library(dplyr)
library(purrr)
map_df(Flowtestlist, ~bind_rows(., .id = 'test_subbasin'), .id = 'test_model') %>%
type.convert(as.is = TRUE) %>%
left_join(FlowtestDF, by = c('test_subbasin', 'test_model')) %>%
mutate(Q_st = Q/Q_mean) %>%
split(.$test_model) %>%
map(~.x %>% select(Q, Q_st) %>% split(.x$test_subbasin))
#$`1`
#$`1`$`910`
# Q Q_st
#1 650 0.7471264
#2 720 0.8275862
#3 550 0.6321839
#4 580 0.6666667
#5 800 0.9195402
#$`1`$`950`
# Q Q_st
#6 550 0.7189542
#7 770 1.0065359
#8 520 0.6797386
#9 540 0.7058824
#10 790 1.0326797
#$`2`
#$`2`$`910`
# Q Q_st
#11 450 0.5467801
#12 620 0.7533414
#13 750 0.9113001
#14 580 0.7047388
#15 800 0.9720535
#$`2`$`950`
# Q Q_st
#16 650 0.9433962
#17 750 1.0885341
#18 580 0.8417997
#19 520 0.7547170
#20 890 1.2917271
library(tidyr)
extr <- function(x){
a <- data.frame(x)
names(a) <- names(x)
a$test_model <- parent.frame()$i
a <- pivot_longer(a,setdiff(names(a),'test_model'),names_to = 'test_subbasin',values_to = 'Q')
a
}
to_df <- lapply(Flowtestlist,extr)
df <- do.call(rbind,to_df)
with_lookup <- merge(df,FlowtestDF,by =c('test_model','test_subbasin'))
with_lookup$Q_st <- with_lookup$Q/with_lookup$Q_mean
with_lookup
output;
test_model test_subbasin Q Q_mean Q_st
<int> <chr> <dbl> <dbl> <dbl>
1 1 910 650 870 0.747
2 1 910 720 870 0.828
3 1 910 550 870 0.632
4 1 910 580 870 0.667
5 1 910 800 870 0.920
6 1 950 550 765 0.719
7 1 950 770 765 1.01
8 1 950 520 765 0.680
9 1 950 540 765 0.706
10 1 950 790 765 1.03
11 2 910 450 823 0.547
12 2 910 620 823 0.753
13 2 910 750 823 0.911
14 2 910 580 823 0.705
15 2 910 800 823 0.972
16 2 950 650 689 0.943
17 2 950 750 689 1.09
18 2 950 580 689 0.842
19 2 950 520 689 0.755
20 2 950 890 689 1.29
The following will derive the required output ...
df <- data.frame(test_subbasin = unlist(Flowtestlist), ref = names(unlist(Flowtestlist)))
df$Q_st <- df$test_subbasin / FlowtestDF$Q_mean[match(gsub("\\..*", "", df$ref), FlowtestDF$test_subbasin)]
df
# test_subbasin ref Q_st
# 1 650 910.Q1 0.7471264
# 2 720 910.Q2 0.8275862
# 3 550 910.Q3 0.6321839
# 4 580 910.Q4 0.6666667
# 5 800 910.Q5 0.9195402
# 6 550 950.Q1 0.7189542
# 7 770 950.Q2 1.0065359
# 8 520 950.Q3 0.6797386
# 9 540 950.Q4 0.7058824
# 10 790 950.Q5 1.0326797
# 11 450 910.Q1 0.5172414
# 12 620 910.Q2 0.7126437
# 13 750 910.Q3 0.8620690
# 14 580 910.Q4 0.6666667
# 15 800 910.Q5 0.9195402
# 16 650 950.Q1 0.8496732
# 17 750 950.Q2 0.9803922
# 18 580 950.Q3 0.7581699
# 19 520 950.Q4 0.6797386
# 20 890 950.Q5 1.1633987
Hello I have a table like so:
Entry TimeOn TimeOff Alarm
1 60 70 355
2 80 85 455
3 100 150 400
4 105 120 320
5 125 130 254
6 135 155 220
7 160 170 850
I would like to understand how i can group those entries so the ones starting during another alarm and ending either during another alarm or after the other alarm such as entries 4,5 & 6 can be filtered out of the data frame?
so this would be the desired result a dataframe that looked like this:
Entry TimeOn TimeOff Alarm
1 60 70 355
2 80 85 455
3 100 150 400
7 160 170 850
so entries 4, 5 and 6 removed
library(dplyr)
library(data.table)
df$flag <- apply(df, 1, function(x) {
nrow(filter(df, data.table::between(x['TimeOn'],df$TimeOn,df$TimeOff)))
})
df[df$flag > 1, ]
Entry TimeOn TimeOff Alarm flag
4 4 105 120 320 2
5 5 125 130 254 2
6 6 135 155 220 2
#Save option using Base R
df$flag <- apply(df,1,function(x) {nrow(df[x['TimeOn'] >= df$TimeOn & x['TimeOn'] <= df$TimeOff,])})
Suggested by #Andre Elrico
df[apply(df, 1, function(x) { nrow( df[between(x[['TimeOn']],df$TimeOn,df$TimeOff),] ) > 1 }),]
data
df <- read.table(text="
Entry TimeOn TimeOff Alarm
1 60 70 355
2 80 85 455
3 100 150 400
4 105 120 320
5 125 130 254
6 135 155 220
7 160 170 850
",header=T)
I have data frame like this
test <- data.frame(gr=rep(letters[1:2],each=6),No=c(100:105,200:205))
gr No
1 a 100
2 a 101
3 a 102
4 a 103
5 a 104
6 a 105
7 b 200
8 b 201
9 b 202
10 b 203
11 b 204
12 b 205
in the No column the numbers are increasing in each gr. I need to sum gr a with 100 and b with 50 and need to have consecutive decrease after this operation.
I would like to have a new column that consecutive decrease with this increase. So I tried
decrese_func <- function(No,gr){
if(any(gr=="a")){
No+100
}
else
No+50
}
test%>%
group_by(gr)%>%
mutate(new_column=decrese_func(No,gr))
# A tibble: 12 x 3
# Groups: gr [2]
gr No new_column
<fct> <int> <dbl>
1 a 100 200
2 a 101 201
3 a 102 202
4 a 103 203
5 a 104 204
6 a 105 205
7 b 200 250
8 b 201 251
9 b 202 252
10 b 203 253
11 b 204 254
12 b 205 255
but what I need is like this
gr No new_column
<fct> <int> <dbl>
1 a 100 200
2 a 101 199
3 a 102 198
4 a 103 197
5 a 104 196
6 a 105 195
7 b 200 250
8 b 201 249
9 b 202 248
10 b 203 247
11 b 204 246
12 b 205 245
I cannot figure it out how to have consecutive decrease ?
Thx.
Not the most elegant answer but in the mean time, this may work:
library(dplyr)
test %>%
mutate(A = case_when(gr == "a" ~ 100,
gr == "b" ~ 50,
TRUE ~ NA_real_)) %>%
group_by(gr) %>%
mutate(B = (1:NROW(gr) - 1) * 2,
New_Column = No + A - B)
# A tibble: 12 x 5
# Groups: gr [2]
gr No A B New_Column
<fct> <int> <dbl> <dbl> <dbl>
1 a 100 100 0 200
2 a 101 100 2 199
3 a 102 100 4 198
4 a 103 100 6 197
5 a 104 100 8 196
6 a 105 100 10 195
7 b 200 50 0 250
8 b 201 50 2 249
9 b 202 50 4 248
10 b 203 50 6 247
11 b 204 50 8 246
12 b 205 50 10 245
Add select(gr, No, New_Column) at the end of the chain to get gr, No and New_Column only. I left the other columns just to show what's going on.
And if you want to wrap it into a function you could do something like:
desc_func <- function(group_var, condition, if_true_add, if_false_add, to_number) {
ifelse(
group_var == condition,
to_number + if_true_add - (1:NROW(group_var) - 1) * 2,
to_number + if_false_add - (1:NROW(group_var) - 1) * 2)
}
test %>%
group_by(gr) %>%
mutate(test_var = desc_func(gr, "a", 100, 50, No))
# A tibble: 12 x 3
# Groups: gr [2]
gr No test_var
<fct> <int> <dbl>
1 a 100 200
2 a 101 199
3 a 102 198
4 a 103 197
5 a 104 196
6 a 105 195
7 b 200 250
8 b 201 249
9 b 202 248
10 b 203 247
11 b 204 246
12 b 205 245
Here is a way to do this in base R
test$New <- with(test, No + c(100, 50)[cumsum(!duplicated(gr))] - 2*(No %% 100))
test$New
#[1] 200 199 198 197 196 195 250 249 248 247 246 245
Or a slight variation with match
with(test, No + c(100, 50)[match(gr, unique(gr))] - 2*(No %% 100))
I am trying to take the following data, and then uses this data to create a table which has the information broken down by state.
Here's the data:
> head(mydf2, 10)
lead_id buyer_account_id amount state
1 52055267 62 300 CA
2 52055267 64 264 CA
3 52055305 64 152 CA
4 52057682 62 75 NJ
5 52060519 62 750 OR
6 52060519 64 574 OR
15 52065951 64 152 TN
17 52066749 62 600 CO
18 52062751 64 167 OR
20 52071186 64 925 MN
I've allready subset the states that I'm interested in and have just the data I'm interested in:
mydf2 = subset(mydf, state %in% c("NV","AL","OR","CO","TN","SC","MN","NJ","KY","CA"))
Here's an idea of what I'm looking for:
State Amount Count
NV 1 50
NV 2 35
NV 3 20
NV 4 15
AL 1 10
AL 2 6
AL 3 4
AL 4 1
...
For each state, I'm trying to find a count for each amount "level." I don't necessary need to group the amount variable, but keep in mind that they are are not just 1,2,3, etc
> mydf$amount
[1] 300 264 152 75 750 574 113 152 750 152 675 489 188 263 152 152 600 167 34 925 375 156 675 152 488 204 152 152
[29] 600 489 488 75 152 152 489 222 563 215 452 152 152 75 100 113 152 150 152 150 152 452 150 152 152 225 600 620
[57] 113 152 150 152 152 152 152 152 152 152 640 236 152 480 152 152 200 152 560 152 240 222 152 152 120 257 152 400
Is there an elegant solution for this in R for this or will I be stuck using Excel (yuck!).
Here's my understanding of what you're trying to do:
Start with a simple data.frame with 26 states and amounts only ranging from 1 to 50 (which is much more restrictive than what you have in your example, where the range is much higher).
set.seed(1)
mydf <- data.frame(
state = sample(letters, 500, replace = TRUE),
amount = sample(1:50, 500, replace = TRUE)
)
head(mydf)
# state amount
# 1 g 28
# 2 j 35
# 3 o 33
# 4 x 34
# 5 f 24
# 6 x 49
Here's some straightforward tabulation. I've also removed any instances where frequency equals zero, and I've reordered the output by state.
temp1 <- data.frame(table(mydf$state, mydf$amount))
temp1 <- temp1[!temp1$Freq == 0, ]
head(temp1[order(temp1$Var1), ])
# Var1 Var2 Freq
# 79 a 4 1
# 157 a 7 2
# 391 a 16 1
# 417 a 17 1
# 521 a 21 1
# 1041 a 41 1
dim(temp1) # How many rows/cols
# [1] 410 3
Here's a little bit different tabulation. We are tabulating after grouping the "amount" values. Here, I've manually specified the breaks, but you could just as easily let R decide what it thinks is best.
temp2 <- data.frame(table(mydf$state,
cut(mydf$amount,
breaks = c(0, 12.5, 25, 37.5, 50),
include.lowest = TRUE)))
temp2 <- temp2[!temp2$Freq == 0, ]
head(temp2[order(temp2$Var1), ])
# Var1 Var2 Freq
# 1 a [0,12.5] 3
# 27 a (12.5,25] 3
# 79 a (37.5,50] 3
# 2 b [0,12.5] 2
# 28 b (12.5,25] 6
# 54 b (25,37.5] 5
dim(temp2)
# [1] 103 3
I am not sure if I understand correctly (you have two data.frames mydf and mydf2). I'll assume your data is in mydf. Using aggregate:
mydf$count <- 1:nrow(mydf)
aggregate(data = mydf, count ~ amount + state, length)
Is this what you are looking for?
Note: here count is a variable that is created just to get directly the output of the 3rd column as count.
Alternatives with ddply from plyr:
# no need to create a variable called count
ddply(mydf, .(state, amount), summarise, count=length(lead_id))
Here' one could use any column that exists in one's data instead of lead_id. Even state:
ddply(mydf, .(state, amount), summarise, count=length(state))
Or equivalently without using summarise:
ddply(mydf, .(state, amount), function(x) c(count=nrow(x)))