How to create nested for loop for a certain range - r

I want to add rows to my data frame over a range of numbers.
I'm not great at loops, so here is my attempt in case it helps:
# Attempt to expand rows of Data by writing copies back into Data at row k.
k=1 # index of the next row to write
for (i in 1:nrow(Data)){
# NOTE(review): l is never used in the loop body; this level only repeats the inner loop.
for (l in 1: Data[i,2]){
# NOTE(review): reads column 5 of Data — confirm that Data really has a fifth column.
for (j in 1: Data[i,5]){
Data[k,]<-Data[i,] # copy row i into row k (Data is overwritten while it is still being read)
Data[k,3]<-Data[i,3]+j-1 # shift column 3 by j - 1
k=k+1}
}
}
Here is a sample dataframe:
# Sample data: two rows with the columns gid, did, pid and plays.
# `ncol` (not `NCOs`) is the matrix() argument that sets the column count.
Data <- data.frame(matrix(ncol = 4, nrow = 2))
x <- c("gid", "did", "pid", "plays")
colnames(Data) <- x
Data[1, ] <- c(1, 1, 2, 8)
Data[2, ] <- c(1, 2, 12, 6)
The output should have a total of 14 rows and look like this:
1 1 2 8
1 1 3 8
1 1 4 8
.
.
.
.
1 2 12 6
1 2 13 6
.
.
1 2 17 6

Maybe tidyr::complete is a good choice here.
library(dplyr)
library(tidyr)
# Expand each (gid, did) row into `plays` consecutive pid values, then carry
# the plays value down into the rows that complete() created.
Data %>%
  group_by(gid, did) %>%
  complete(pid = seq(pid, pid + plays - 1)) %>%
  fill(plays) %>%
  ungroup() # drop the grouping so later steps are not silently group-wise
# gid did pid plays
# <dbl> <dbl> <dbl> <dbl>
# 1 1 1 2 8
# 2 1 1 3 8
# 3 1 1 4 8
# 4 1 1 5 8
# 5 1 1 6 8
# 6 1 1 7 8
# 7 1 1 8 8
# 8 1 1 9 8
# 9 1 2 12 6
#10 1 2 13 6
#11 1 2 14 6
#12 1 2 15 6
#13 1 2 16 6
#14 1 2 17 6

Related

Apply same function to several data replicates in R

Consider the following data simulation mechanism:
set.seed(1)
# Simulate binary outcomes for G groups with two individuals per group.
#
# G: number of groups (a non-negative whole number).
# Returns a data frame with 2 * G rows and the columns
#   id - individual index
#   i  - group index, each group repeated twice
#   Y  - draw from rbinom(n, 1, 0.5); depends on the current RNG state
simulW <- function(G) {
  n <- 2 * G # two individuals per group
  # seq_len() keeps G = 0 well-defined (an empty data frame), whereas 1:G
  # would count down from 1 to 0.
  i <- rep(seq_len(G), each = 2)
  Y <- rbinom(n, 1, 0.5)
  data.frame(id = seq_len(n), i, Y)
}
r<-5 #5 replicates
dat1 <- replicate(r, simulW(G = 10 ), simplify=FALSE)
#For example the first data replicate will be
> dat1[[1]]
id i Y
1 1 1 0
2 2 1 1
3 3 2 0
4 4 2 0
5 5 3 0
6 6 3 0
7 7 4 0
8 8 4 1
9 9 5 1
10 10 5 0
The code below can perform group wise (i is the group) sum of Y but by default considers only the first replicate i.e dat1[[1]].
Di<-aggregate( Y, by=list ( i ),FUN=sum) #Sum per group for the first dataset
e<-colSums(Di [ 2 ] ) #Total sum of Y for all groups for dataset 1
> e
x
8
di<-Di [ 2 ] # Groupwise sum for replicate 1
> di
x
1 2
2 2
3 2
4 0
5 2
How can I use the same function to perform the group-wise sum for the other replicates?
Maybe something like:
for (m in 1:r )
{
Di[m]<-
e[m]<-
di[m]<-
}
You may use aggregate in lapply -
# One data frame of per-group sums of Y for every replicate in dat1.
result <- lapply(
  dat1,
  function(rep_df) aggregate(Y ~ i, data = rep_df, FUN = sum)
)
result
#[[1]]
# i Y
#1 1 1
#2 2 1
#3 3 0
#4 4 0
#5 5 1
#6 6 1
#7 7 0
#8 8 2
#9 9 1
#10 10 1
#[[2]]
# i Y
#1 1 2
#2 2 2
#3 3 2
#4 4 0
#5 5 2
#6 6 1
#7 7 0
#8 8 0
#9 9 1
#10 10 1
#...
#...
We may use tidyverse
library(purrr)
library(dplyr)
map(dat1, ~ .x %>%
group_by(i) %>%
summarise(Y = sum(Y)))
-output
[[1]]
# A tibble: 10 × 2
i Y
<int> <int>
1 1 0
2 2 2
3 3 1
4 4 2
5 5 1
6 6 0
7 7 1
8 8 1
9 9 2
10 10 1
[[2]]
# A tibble: 10 × 2
i Y
<int> <int>
1 1 1
2 2 1
3 3 0
4 4 0
5 5 1
6 6 1
7 7 0
8 8 2
9 9 1
10 10 1
[[3]]
# A tibble: 10 × 2
i Y
<int> <int>
1 1 2
2 2 2
3 3 2
4 4 0
5 5 2
6 6 1
7 7 0
8 8 0
9 9 1
10 10 1
[[4]]
# A tibble: 10 × 2
i Y
<int> <int>
1 1 1
2 2 0
3 3 1
4 4 1
5 5 1
6 6 1
7 7 0
8 8 1
9 9 1
10 10 2
[[5]]
# A tibble: 10 × 2
i Y
<int> <int>
1 1 1
2 2 0
3 3 1
4 4 1
5 5 0
6 6 0
7 7 2
8 8 2
9 9 0
10 10 2

Retrieve a value by another column criteria in R

I need some help.
I have this data frame:
df <- data.frame(month = c(1,1,1,1,1,2,2,2,2,2),
day = c(1,2,3,4,5,1,2,3,4,5),
flow = c(2,5,7,8,5,4,6,7,9,2))
month day flow
1 1 1 2
2 1 2 5
3 1 3 7
4 1 4 8
5 1 5 5
6 2 1 4
7 2 2 6
8 2 3 7
9 2 4 9
10 2 5 2
But I want to know the day of the minimum flow for each month:
month day flow dayminflowofthemonth
1 1 1 2 1
2 1 2 5 1
3 1 3 7 1
4 1 4 8 1
5 1 5 5 1
6 2 1 4 5
7 2 2 6 5
8 2 3 7 5
9 2 4 9 5
10 2 5 2 5
This repetition is not a problem; I will use a pivot function.
Thanks, people!
We can use which.min to return the index of 'min'imum 'flow' per group and use that to get the corresponding 'day' to create the column with mutate
library(dplyr)
df <- df %>%
group_by(month) %>%
mutate(dayminflowofthemonth = day[which.min(flow)]) %>%
ungroup
-output
df
# A tibble: 10 x 4
# month day flow dayminflowofthemonth
# <dbl> <dbl> <dbl> <dbl>
# 1 1 1 2 1
# 2 1 2 5 1
# 3 1 3 7 1
# 4 1 4 8 1
# 5 1 5 5 1
# 6 2 1 4 5
# 7 2 2 6 5
# 8 2 3 7 5
# 9 2 4 9 5
#10 2 5 2 5
Another option using indexing inside dplyr pipeline:
library(dplyr)
#Code
newdf <- df %>% group_by(month) %>% mutate(Val=day[flow==min(flow)][1])
Output:
# A tibble: 10 x 4
# Groups: month [2]
month day flow Val
<dbl> <dbl> <dbl> <dbl>
1 1 1 2 1
2 1 2 5 1
3 1 3 7 1
4 1 4 8 1
5 1 5 5 1
6 2 1 4 5
7 2 2 6 5
8 2 3 7 5
9 2 4 9 5
10 2 5 2 5
Here is a base R option using ave
transform(
df,
dayminflowofthemonth = ave(day*(ave(flow,month,FUN = min)==flow),month,FUN = max)
)
which gives
month day flow dayminflowofthemonth
1 1 1 2 1
2 1 2 5 1
3 1 3 7 1
4 1 4 8 1
5 1 5 5 1
6 2 1 4 5
7 2 2 6 5
8 2 3 7 5
9 2 4 9 5
10 2 5 2 5
One more base R approach:
# Day of the minimum flow for each month, mapped back onto every row.
# The per-month result is looked up by month *label* (as.character) instead of
# by position, so it stays correct when the months are not exactly 1, 2, ..., k.
df$dayminflowofthemonth <- unname(
  vapply(
    split(df, df$month),
    function(x) x$day[which.min(x$flow)],
    numeric(1)
  )[as.character(df$month)]
)

Adding sequence of numbers to the data

Hi I have a data frame like this
df <-data.frame(x=rep(rep(seq(0,3),each=2),2 ),gr=gl(2,8))
x gr
1 0 1
2 0 1
3 1 1
4 1 1
5 2 1
6 2 1
7 3 1
8 3 1
9 0 2
10 0 2
11 1 2
12 1 2
13 2 2
14 2 2
15 3 2
16 3 2
I want to add a new column with a numbering sequence that starts where the x value == 0.
I tried
library(dplyr)
df%>%
group_by(gr)%>%
mutate(numbering=seq(2,8,2))
Error in mutate_impl(.data, dots) :
Column `numbering` must be length 8 (the group size) or one, not 4
?
Just as a side note, mutate(numbering=rep(seq(2,8,2),each=2)) would work for this minimal example, but for the general case it's better to look for where the x value changes, starting from 0!
the expected output
x gr numbering
1 0 1 2
2 0 1 2
3 1 1 4
4 1 1 4
5 2 1 6
6 2 1 6
7 3 1 8
8 3 1 8
9 0 2 2
10 0 2 2
11 1 2 4
12 1 2 4
13 2 2 6
14 2 2 6
15 3 2 8
16 3 2 8
Do you mean something like this?
library(tidyverse);
df %>%
group_by(gr) %>%
mutate(numbering = cumsum(c(1, diff(x) != 0)))
## A tibble: 16 x 3
## Groups: gr [2]
# x gr numbering
# <int> <fct> <dbl>
# 1 0 1 1.
# 2 0 1 1.
# 3 1 1 2.
# 4 1 1 2.
# 5 2 1 3.
# 6 2 1 3.
# 7 3 1 4.
# 8 3 1 4.
# 9 0 2 1.
#10 0 2 1.
#11 1 2 2.
#12 1 2 2.
#13 2 2 3.
#14 2 2 3.
#15 3 2 4.
#16 3 2 4.
Or if you must have a numbering sequence 2,4,6,... instead of 1,2,3,... you can do
df %>%
group_by(gr) %>%
mutate(numering = 2 * cumsum(c(1, diff(x) != 0)));
## A tibble: 16 x 3
## Groups: gr [2]
# x gr numering
# <int> <fct> <dbl>
# 1 0 1 2.
# 2 0 1 2.
# 3 1 1 4.
# 4 1 1 4.
# 5 2 1 6.
# 6 2 1 6.
# 7 3 1 8.
# 8 3 1 8.
# 9 0 2 2.
#10 0 2 2.
#11 1 2 4.
#12 1 2 4.
#13 2 2 6.
#14 2 2 6.
#15 3 2 8.
#16 3 2 8.
Here is an option using match to get the index and then pass on the seq values to fill
# Number the distinct x values within each group 2, 4, 6, ... in order of
# first appearance. Scaling the match() position by 2 avoids assuming that
# every x value occurs exactly twice, which `length.out = n()/2` relied on.
df %>%
  group_by(gr) %>%
  mutate(numbering = 2 * match(x, unique(x)))
# A tibble: 16 x 3
# Groups: gr [2]
# x gr numbering
# <int> <fct> <dbl>
# 1 0 1 2
# 2 0 1 2
# 3 1 1 4
# 4 1 1 4
# 5 2 1 6
# 6 2 1 6
# 7 3 1 8
# 8 3 1 8
# 9 0 2 2
#10 0 2 2
#11 1 2 4
#12 1 2 4
#13 2 2 6
#14 2 2 6
#15 3 2 8
#16 3 2 8

Grouping rows with multiple conditions across columns, incl. a sorting, in R/dplyr

In the following dataframe, I have 24 points in the 3D space (2 horizontal locations along X and Y, each with 12 vertical values along Z).
I would like to group together the points vertically if:
they have the same val value and
they follow each other along the Z axis (so two 1 separated by another value would not have the same ID).
And this should be done only for the values beyond the 3 first Z values (which automatically get ID = 1, 2 and 3 respectively, the following ones start at 4).
set.seed(50)
library(dplyr)
mydf = data.frame(X = rep(1, 24), Y = rep(1:2, each = 12),
Z = c(sample(1:12,12,replace=F), sample(4:16,12,replace=F)),
val = c(rep(1:3, 8)))
mydf = mydf %>% group_by(X,Y) %>% arrange(X,Y,Z) %>% data.frame()
# X Y Z val
# 1 1 1 1 3 # In this X-Y location, Z starts at 1
# 2 1 1 2 3
# 3 1 1 3 3
# 4 1 1 4 2
# 5 1 1 5 2
# 6 1 1 6 1
# 7 1 1 7 1
# 8 1 1 8 1
# 9 1 1 9 1
# 10 1 1 10 2
# 11 1 1 11 2
# 12 1 1 12 3
# 13 1 2 4 2 # In this X-Y location, Z starts at 4
# [etc (see below)]
Desired output (note for example that lines 4-5 and 10-11 get a different ID):
rle1 = rle(mydf[4:12,]$val)
# Run Length Encoding
# lengths: int [1:4] 2 4 2 1
# values : int [1:4] 2 1 2 3
rle2 = rle(mydf[4:12 + 12,]$val)
# Run Length Encoding
# lengths: int [1:7] 2 1 1 2 1 1 1
# values : int [1:7] 3 1 2 1 3 1 2
mydf$ID = c(1:3, rep(4:(3+length(rle1$lengths)), rle1$lengths),
1:3, rep(4:(3+length(rle2$lengths)), rle2$lengths))
# X Y Z val ID
# 1 1 1 1 3 1
# 2 1 1 2 3 2
# 3 1 1 3 3 3
# 4 1 1 4 2 4
# 5 1 1 5 2 4
# 6 1 1 6 1 5
# 7 1 1 7 1 5
# 8 1 1 8 1 5
# 9 1 1 9 1 5
# 10 1 1 10 2 6
# 11 1 1 11 2 6
# 12 1 1 12 3 7 # In this X-Y location, I have 7 groups in the end
# 13 1 2 4 2 1
# 14 1 2 5 2 2
# 15 1 2 6 3 3
# 16 1 2 7 3 4
# 17 1 2 9 3 4
# 18 1 2 10 1 5
# 19 1 2 11 2 6
# 20 1 2 12 1 7
# 21 1 2 13 1 7
# 22 1 2 14 3 8
# 23 1 2 15 1 9
# 24 1 2 16 2 10 # In this X-Y location, I have 10 groups in the end
How could I perform this more efficiently, or in one line, and why not with dplyr, supposing this applies for many (X,Y) locations and with always the 3 first Z values (which starts at a different value at each location) followed by a location-dependent number of following ID groups?
I was starting with a try to work with a vector from a conditional subset in dplyr, which is wrong:
mydf %>% group_by(X,Y) %>% arrange(X,Y,Z) %>%
mutate(dummy = mean(rle(val)$values))
Error: error in evaluating the argument 'x' in selecting a method for function 'mean': Error in rle(c(1L, 2L, 3L, 1L, 2L, 3L, 3L, 3L, 1L, 1L, 2L, 2L))$function (x, :
invalid subscript type 'closure'
Thanks!
You can use data.table::rleid on val starting from the 4th element and then add an offset of 3, this could simplify the rle calculation;
library(dplyr); library(data.table)
mydf %>%
  group_by(X, Y) %>%
  # The first three rows get IDs 1:3; later rows get run IDs starting at 4.
  # head(..., n()) trims the fixed 1:3 prefix so that groups with fewer than
  # three rows do not fail with a length mismatch.
  mutate(ID = head(c(1:3, rleid(val[-(1:3)]) + 3), n())) %>%
  as.data.frame() # for print purpose only
# X Y Z val ID
#1 1 1 1 3 1
#2 1 1 2 3 2
#3 1 1 3 3 3
#4 1 1 4 2 4
#5 1 1 5 2 4
#6 1 1 6 1 5
#7 1 1 7 1 5
#8 1 1 8 1 5
#9 1 1 9 1 5
#10 1 1 10 2 6
#11 1 1 11 2 6
#12 1 1 12 3 7
#13 1 2 4 2 1
#14 1 2 5 2 2
#15 1 2 6 3 3
#16 1 2 7 3 4
#17 1 2 9 3 4
#18 1 2 10 1 5
#19 1 2 11 2 6
#20 1 2 12 1 7
#21 1 2 13 1 7
#22 1 2 14 3 8
#23 1 2 15 1 9
#24 1 2 16 2 10
Or without rleid, use cumsum + diff:
# Same IDs via cumsum + diff. Without head(..., n()) a group of exactly three
# rows would receive four IDs (1, 2, 3 and the seed value 4) and mutate() would
# fail; smaller groups would fail the same way.
mydf %>%
  group_by(X, Y) %>%
  mutate(ID = head(c(1:3, cumsum(c(4, diff(val[-(1:3)]) != 0))), n()))

count positive negative values in column by group

I want to create two variables giving me the total number of positive and negative values by id, hopefully using dplyr.
Example data:
library(dplyr)
set.seed(42)
df <- data.frame (id=rep(1:10,each=10),
ff=rnorm(100, 0,14 ))
> head(df,20)
id ff
1 1 19.1934183
2 1 -7.9057744
3 1 5.0837978
4 1 8.8600765
5 1 5.6597565
6 1 -1.4857432
7 1 21.1613080
8 1 -1.3252265
9 1 28.2579320
10 1 -0.8779974
11 2 18.2681752
12 2 32.0130355
13 2 -19.4440498
14 2 -3.9030427
15 2 -1.8664987
16 2 8.9033056
17 2 -3.9795409
18 2 -37.1903759
19 2 -34.1665370
20 2 18.4815868
the resulting dataset should look like:
> head(df,20)
id ff pos neg
1 1 19.1934183 6 4
2 1 -7.9057744 6 4
3 1 5.0837978 6 4
4 1 8.8600765 6 4
5 1 5.6597565 6 4
6 1 -1.4857432 6 4
7 1 21.1613080 6 4
8 1 -1.3252265 6 4
9 1 28.2579320 6 4
10 1 -0.8779974 6 4
11 2 18.2681752 4 6
12 2 32.0130355 4 6
13 2 -19.4440498 4 6
14 2 -3.9030427 4 6
15 2 -1.8664987 4 6
16 2 8.9033056 4 6
17 2 -3.9795409 4 6
18 2 -37.1903759 4 6
19 2 -34.1665370 4 6
20 2 18.4815868 4 6
I have thought something similar to this will work:
df<-df%>% group_by(id) %>% mutate(pos= nrow(ff>0)) %>% ungroup()
Any help would be great, thanks.
You need sum():
df %>% group_by(id) %>%
mutate(pos = sum(ff>0),
neg = sum(ff<0))
For a fun (and a fast) solution data.table can also be used:
library(data.table)
setDT(df)
df[, ":="(pos = sum(ff > 0), neg = sum(ff < 0)), by = id]
Here's an answer that adds the ifelse part of your question:
# Per-id counts of positive and negative ff values, plus a 0/1 flag for
# "this id has at least one negative value". One grouped mutate() is enough:
# the second group_by(id) was redundant, and as.numeric() replaces ifelse()
# on a single logical value. The result stays grouped by id, as before.
df <- df %>%
  group_by(id) %>%
  mutate(
    pos = sum(ff > 0),
    neg = sum(ff < 0),
    any_neg = as.numeric(any(ff < 0))
  )
Output:
> head(df, 20)
Source: local data frame [20 x 5]
Groups: id [2]
id ff pos neg any_neg
<int> <dbl> <int> <int> <dbl>
1 1 19.1934183 6 4 1
2 1 -7.9057744 6 4 1
3 1 5.0837978 6 4 1
4 1 8.8600765 6 4 1
5 1 5.6597565 6 4 1
6 1 -1.4857432 6 4 1
7 1 21.1613080 6 4 1
8 1 -1.3252265 6 4 1
9 1 28.2579320 6 4 1
10 1 -0.8779974 6 4 1
11 2 18.2681752 4 6 1
12 2 32.0130355 4 6 1
13 2 -19.4440498 4 6 1
14 2 -3.9030427 4 6 1
15 2 -1.8664987 4 6 1
16 2 8.9033056 4 6 1
17 2 -3.9795409 4 6 1
18 2 -37.1903759 4 6 1
19 2 -34.1665370 4 6 1
20 2 18.4815868 4 6 1

Resources