Row by row application in R [duplicate] - r

I have my data in the form of a data.table given below
structure(list(atp = c(1, 0, 1, 0, 0, 1), len = c(2, NA, 3, NA,
NA, 1), inv = c(593, 823, 668, 640, 593, 745), GU = c(36, 94,
57, 105, 48, 67), RUTL = c(100, NA, 173, NA, NA, 7)), .Names = c("atp",
"len", "inv", "GU", "RUTL"), row.names = c(NA, -6L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x0000000000320788>)
I need to form 4 new columns csi_begin,csi_end, IRQ and csi_order. the value of csi_begin and csi_end when atp=1 depends directly on inv and gu values.
But when atp is not equal to 1 csi_begin and csi_end depends on inv and gu values and IRQ value of previous row
The value of IRQ depends on csi_order of that row if atp==1 else its 0 and csi_order value depends on two rows previous csi_begin value.
I have written the condition with the help of for loop.
Below is the code given
lostsales<-function(transit)
{
if (transit$atp==1)
{
transit$csi_begin[i]<-(transit$inv)[i]
transit$csi_end[i]<-transit$csi_begin[i]-transit$GU[i]
}
else
{
transit$csi_begin[i]<-(transit$inv)[i]+transit$IRQ[i-1]
transit$csi_end[i]<-transit$csi_begin[i]-transit$GU[i]
}
if (transit$csi_begin[i-2]!= NA)
{
transit$csi_order[i]<-transit$csi_begin[i-2]
}
else
{ transit$csi_order[i]<-0}
if (transit$atp==1)
{
transit$IRQ[i]<-transit$csi_order[i]-transit$RUTL[i]
}
else
{
transit$IRQ[i]<-0
}
}
Can anyone help me how to do efficient looping with data.tables using setkeys? As my data set is very large and I cannot use for loop else the timing would be very high.

Adding the desired outcome to your example would be very helpful, as I'm having trouble following the if/then logic. But I took a stab at it anyway:
library(data.table)
# Example data:
dt <- structure(list(atp = c(1, 0, 1, 0, 0, 1), len = c(2, NA, 3, NA, NA, 1), inv = c(593, 823, 668, 640, 593, 745), GU = c(36, 94, 57, 105, 48, 67), RUTL = c(100, NA, 173, NA, NA, 7)), .Names = c("atp", "len", "inv", "GU", "RUTL"), row.names = c(NA, -6L), class = c("data.table", "data.frame"), .internal.selfref = "<pointer: 0x0000000000320788>")
# Add a row number:
dt[,rn:=.I]
# Use this function to get the value from a previous (shiftLen is negative) or future (shiftLen is positive) row:
rowShift <- function(x, shiftLen = 1L) {
r <- (1L + shiftLen):(length(x) + shiftLen)
r[r<1] <- NA
return(x[r])
}
# My attempt to follow the seemingly circular if/then rules:
lostsales2 <- function(transit) {
# If atp==1, set csi_begin to inv and csi_end to csi_begin - GU:
transit[atp==1, `:=`(csi_begin=inv, csi_end=inv-GU)]
# Set csi_order to the value of csi_begin from two rows prior:
transit[, csi_order:=rowShift(csi_begin,-2)]
# Set csi_order to 0 if csi_begin from two rows prior was NA
transit[is.na(csi_order), csi_order:=0]
# Initialize IRQ to 0
transit[, IRQ:=0]
# If ATP==1, set IRQ to csi_order - RUTL
transit[atp==1, IRQ:=csi_order-RUTL]
# If ATP!=1, set csi_begin to inv + IRQ value from previous row, and csi_end to csi_begin - GU
transit[atp!=1, `:=`(csi_begin=inv+rowShift(IRQ,-1), csi_end=inv+rowShift(IRQ,-1)-GU)]
return(transit)
}
lostsales2(dt)
## atp len inv GU RUTL rn csi_begin csi_end csi_order IRQ
## 1: 1 2 593 36 100 1 593 557 0 -100
## 2: 0 NA 823 94 NA 2 NA NA 0 0
## 3: 1 3 668 57 173 3 668 611 593 420
## 4: 0 NA 640 105 NA 4 640 535 0 0
## 5: 0 NA 593 48 NA 5 593 545 668 0
## 6: 1 1 745 67 7 6 745 678 640 633
Is this output close to what you were expecting?

Related

How to find min and max in dplyr?

I know the sum of points for each person.
I need to know: what is the minimum number of points that a person could have. And what is the maximum number of points that a person could have.
What I have tried:
min_and_max <- dataset %>%
group_by(person) %>%
dplyr::filter(min(sum(points, na.rm = T))) %>%
distinct(person) %>%
pull()
min_and_max
My dataset:
id person points
201 rt99 NA
201 rt99 3
201 rt99 2
202 kt 4
202 kt NA
202 kt NA
203 rr 4
203 rr NA
203 rr NA
204 jk 2
204 jk 2
204 jk NA
322 knm3 5
322 knm3 NA
322 knm3 3
343 kll2 2
343 kll2 1
343 kll2 5
344 kll NA
344 kll 7
344 kll 1
I would suggest this dplyr approach. You have to summarize data like this:
library(tidyverse)
#Code
df %>% group_by(id,person) %>%
summarise(Total=sum(points,na.rm = T),
min=min(points,na.rm = T),
max=max(points,na.rm=T))
Output:
# A tibble: 7 x 5
# Groups: id [7]
id person Total min max
<int> <chr> <int> <int> <int>
1 201 rt99 5 2 3
2 202 kt 4 4 4
3 203 rr 4 4 4
4 204 jk 4 2 2
5 322 knm3 8 3 5
6 343 kll2 8 1 5
7 344 kll 8 1 7
Here is the data.table solution -
dataset[, min_points := min(points, na.rm = T), by = person]
dataset[, max_points := max(points, na.rm = T), by = person]
Since I don't have your data, I cannot test this code, but it should work fine.
The summarize() verb is what you want for this. You don't even need to filter out the NA values first since both min() and max() can have na.rm = TRUE.
library(dplyr)
min_and_max <- dataset %>%
group_by(person) %>%
summarize(min = min(points, na.rm = TRUE),
max = max(points, na.rm = TRUE))
min_and_max
# A tibble: 7 x 3
person min max
<chr> <dbl> <dbl>
1 jk 2 2
2 kll 1 7
3 kll2 1 5
4 knm3 3 5
5 kt 4 4
6 rr 4 4
7 rt99 2 3
dput(dataset)
structure(list(id = c(201, 201, 201, 202, 202, 202, 203, 203,
203, 204, 204, 204, 322, 322, 322, 343, 343, 343, 344, 344, 344
), person = c("rt99", "rt99", "rt99", "kt", "kt", "kt", "rr",
"rr", "rr", "jk", "jk", "jk", "knm3", "knm3", "knm3", "kll2",
"kll2", "kll2", "kll", "kll", "kll"), points = c(NA, 3, 2, 4,
NA, NA, 4, NA, NA, 2, 2, NA, 5, NA, 3, 2, 1, 5, NA, 7, 1)), class = "data.frame", row.names = c(NA,
-21L), spec = structure(list(cols = list(id = structure(list(), class = c("collector_double",
"collector")), person = structure(list(), class = c("collector_character",
"collector")), points = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))

R: calculating new variables to a dataset by applying different formulas

I have a dataframe "data" with the following structure:
structure(list(age = c(45, 4, 32, 45), sex = c(1, 0, 1, 0), height = c(165,
178, 145, 132), weight = c(65, 73, 60, 45)), row.names = c(NA,
-4L), class = c("tbl_df", "tbl", "data.frame"))
And I would like to add to this data.frame two new variables (var1, var2), which should be calculated with the two following formulas:
var1 = age*height + (4 if sex==1 OR 2 if sex==0)
var2 = height*weight + (1 if age>40 or 2 if age=<40)
I have a problem both in adding the two variables to the data frame, both in applying a function (I tried to build a function, but seems that can be applied only to a single value and not to all values from all rows).
Can anyone help me, please?
akrun's suggestion of using Boolean arithmetic is a good one but you could also do simply a Boolean version of your own expression substituting multiplication for the if statements.s (whit mild editing of the "=<" to "<=")
data <- structure(list(age = c(45, 4, 32, 45), sex = c(1, 0, 1, 0), height = c(165, 178, 145, 132), weight = c(65, 73, 60, 45)), row.names = c(NA, -4L), class = c("tbl_df", "tbl", "data.frame"))
data <- within(data, {var1 = age*height + 4*(sex==1) + 2 *(sex==0);
var2 = height*weight + (age>40) + 2 *(age <= 40)})
#----
> data
age sex height weight var2 var1
1 45 1 165 65 10726 7429
2 4 0 178 73 12996 714
3 32 1 145 60 8702 4644
4 45 0 132 45 5941 5942
Since the two sets of conditions are each disjoint, the "non-qualifying" choice terms will each be 0.
the function ifelse() is vector based, so it will apply the conditions to each element in the vector.
df <- structure(list(age = c(45, 4, 32, 45), sex = c(1, 0, 1, 0), height = c(165,
178, 145, 132), weight = c(65, 73, 60, 45)), row.names = c(NA,
-4L), class = c("tbl_df", "tbl", "data.frame"))
df$var1 <- ifelse(df$sex == 1,(df$age * df$height) + 4,(df$age * df$height) + 2)
df$var2 <- ifelse(df$age > 40,(df$weight * df$height) + 1,(df$age * df$height) + 2)
final output
> df
# A tibble: 4 x 6
age sex height weight var1 var2
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 45 1 165 65 7429 10726
2 4 0 178 73 714 714
3 32 1 145 60 4644 4642
4 45 0 132 45 5942 5941
I rather the tool case_when() from dplyr package.
Your original data is:
data <-
structure(
list(age = c(45, 4, 32, 45),
sex = c(1, 0, 1, 0),
height = c(165, 178, 145, 132),
weight = c(65, 73, 60, 45)),
row.names = c(NA, -4L),
class = c("tbl_df", "tbl", "data.frame"))
The new variables are created by:
library(dplyr)
data ->
data %>% mutate(var1 = case_when(sex==1 ~ age*height + 4,
sex==0 ~ age*height + 2),
var2 = case_when(age>40 ~ height*weight + 1,
age<=40 ~ height*weight + 2)
)
The outcome is:
# A tibble: 4 x 6
age sex height weight var1 var2
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 45 1 165 65 7429 10726
2 4 0 178 73 714 12996
3 32 1 145 60 4644 8702
4 45 0 132 45 5942 5941
We convert the logical/binary to numeric index by adding 1 to it and use that to change the values to 2, 4, or just 1, 2 and use that in the calculation
library(dplyr)
data %>%
mutate(var1 = (age * height) + c(2, 4)[sex + 1],
var2 = (height * weight) + (age <= 40)+1)
# A tibble: 4 x 6
# age sex height weight var1 var2
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 45 1 165 65 7429 10726
#2 4 0 178 73 714 12996
#3 32 1 145 60 4644 8702
#4 45 0 132 45 5942 5941

Search rows looking for 2 conditions (OR)

I have the following table, with ordered variables:
table <- data.frame(Ident = c("Id_01", "Id_02", "Id_03", "Id_04", "Id_05", "Id_06"),
X01 = c(NA, 18, 0, 14, 0, NA),
X02 = c(0, 16, 0, 17, 0, 53),
X03 = c(NA, 15, 20, 30, 0, 72),
X04 = c(0, 17, 0, 19, 0, NA),
X05 = c(NA, 29, 21, 23, 0, 73),
X06 = c(0, 36, 22, 19, 0, 55))
Ident X01 X02 X03 X04 X05 X06
Id_01 NA 0 NA 0 NA 0
Id_02 18 16 15 17 29 36
Id_03 0 0 20 0 21 22
Id_04 14 17 30 19 23 19
Id_05 0 0 0 0 0 0
Id_06 NA 53 72 NA 73 55
From a previous question, I have the following code provided from a user here, to search by row for one condition (1st and 2nd position > 0) and returning the position of the ocurrence (name of the variable for the specific position):
apply(table[-1], 1, function(x) {
i1 <- x > 0 & !is.na(x)
names(x)[which(i1[-1] & i1[-length(i1)])[1]]})
I'm looking to add a second condition to the apply code, so the conditions needs to be:
1st and 2nd ocurrence (consecutive) > 0
OR
1st and 3rd ocurrence > 0
Considering this change, the output of the evaluation for the table posted before should be:
For Id_01: never occurs (NA?)
For Id_02: 1st position (X01)
For Id_03: 3rd position (X03)
For Id_04: 1st position (X01)
For Id_05: never occurs (NA?)
For Id_06: 2nd position (X02)
Thanks in advance!
We can use lag and lead from dplyr
library(dplyr)
f1 <- function(x) {
i1 <- x > 0 & !is.na(x)
which((i1 & lag(i1, default = i1[1])) |
(i1 & lead(i1, n = 3, default = i1[1])))[1]
}
n1 <- apply(table[-1], 1, f1)
names(table)[-1][n1]
#[1] NA "X01" "X03" "X01" NA "X02"
Or use pmap
library(purrr)
n1 <- pmap_int(table[-1], ~ c(...) %>%
f1)
names(table)[-1][n1]

R lapply update list of data.tables with list - no such index at level 1

I am trying to update a list of date.tables with a list, that seems like it should work as it does in this example:
set.seed(1965)
dt_lst <- list(dt1 <- data.table(a = rnorm(1:4),
b = c(4,3,2,1)), dt2 <- data.table(c = rnorm(1:5),
d = letters[1:5]))
> dt_lst
[[1]]
a b
1: 0.8428429 4
2: 0.2958355 3
3: -1.0520980 2
4: 0.9628192 1
[[2]]
c d
1: -0.05033855 a
2: -0.94065157 b
3: 1.20459624 c
4: -0.47791557 d
5: -0.30362496 e
Now a list for the update (someone said dt1 was group 1 and dt2 was group2 and group needed to be in the results):
group1 <- list(1,2)
And lapply update:
dt_lst_tst <- lapply(seq_along(dt_lst),
function(x)
dt_lst[[x]][, group:= group1[[x]]])
> dt_lst_tst
[[1]]
a b group
1: 0.8428429 4 1
2: 0.2958355 3 1
3: -1.0520980 2 1
4: 0.9628192 1 1
[[2]]
c d group
1: -0.05033855 a 2
2: -0.94065157 b 2
3: 1.20459624 c 2
4: -0.47791557 d 2
5: -0.30362496 e 2
Perfect, and characteristic of my data where I never know how big a data.table I'll have (nrows) nor which 'group' it is supposed to be in
until after the fact, hence updating.
So now, with a very small amount of my data:
> dput(combine_sub1)
list(structure(list(smp = 1:4, x = c(491, 491, 491, 491), y = c(798,
798, 798, 798)), .Names = c("smp", "x", "y"), class = c("data.table",
"data.frame"), row.names = c(NA, -4L), .internal.selfref = <pointer:
0x2b859d8>),
structure(list(smp = 1:6, x = c(650, 650, 650, 650, 650,
650), y = c(437, 437, 437, 437, 437, 437)), .Names = c("smp",
"x", "y"), class = c("data.table", "data.frame"), row.names = c(NA,
-6L), .internal.selfref = <pointer: 0x2b859d8>), structure(list(
smp = 1:5, x = c(480, 485, 540, 572, 589), y = c(462,
462, 455, 451, 450)), .Names = c("smp", "x", "y"), class =
c("data.table",
"data.frame"), row.names = c(NA, -5L), .internal.selfref = <pointer:
0x2b859d8>))
> combine_sub1
[[1]]
smp x y
1: 1 491 798
2: 2 491 798
3: 3 491 798
4: 4 491 798
[[2]]
smp x y
1: 1 650 437
2: 2 650 437
3: 3 650 437
4: 4 650 437
5: 5 650 437
6: 6 650 437
[[3]]
smp x y
1: 1 480 462
2: 2 485 462
3: 3 540 455
4: 4 572 451
5: 5 589 450
group3_lst <- list(1,2,3)
> group3_lst
[[1]]
[1] 1
[[2]]
[1] 2
[[3]]
[1] 3
And using similar lapply as above:
> combine_sub1_tst <- lapply(seq_along(combine_sub1),
+ function(x)
+ combine_sub1[[x]][ , group := group3_lst[[x]]])
Error in group3_lst[[x]] : no such index at level 1
And I can't figure out why the difference. Any help appreciated.
The problem seems to have been caused by the use of variable x in the function call and it interferes with the x column in the data.tables in group3_lst. Use a difference variable name that's not in those data.tables it will work fine, e.g. use i: combine_sub1_tst <- lapply(seq_along(combine_sub1), function(i) combine_sub1[[i]][ , group := group3_lst[[i]]])

How to assign a value in a data frame based on multiple conditions of another data frame

I have two data frames, one consisting of numerical values called 'esame':
media id_poll fin
1 5.330000e+00 360 1
2 6.833333e-02 361 0
3 0.000000e+00 362 0
4 NA 363 0
5 8.200000e-01 364 0
6 3.416667e-01 365 0
7 0.000000e+00 366 0
8 0.000000e+00 367 0
9 0.000000e+00 368 0
10 NA 369 0
11 6.150000e-01 370 0
12 0.000000e+00 371 0
13 0.000000e+00 372 0
14 NA 373 0
15 0.000000e+00 374 0
16 0.000000e+00 375 0
17 0.000000e+00 376 0
18 1.298333e+00 377 0
And the second one consisting of numerical ranges which I would like to use to check in which range the 'media' field of the first data.frame is.
If it's in the first range I would like to assign "1" to the field "fin" of the first data.frame, if it's in the second I would like to assign "2" and so on.
So here it is the second data.frame with some of the conditions I'll need:
Range1 Range2 Range3 Range4 ID
0.5 9.9 29.9 >30 360
0.5 15.9 49.9 >50 361
0 4.9 24.9 >25 362
First of all I suppose I won't need to declare Range4 as it's already an information included in Range3. I removed the initial value of all numerical ranges as I need just a single number to check against (or so I think). The same row for ID 360 could be written as:
Range1 Range2 Range3 Range4 ID
0.5 0.6-9.9 10-29.9 >30 360
So my guess is to do something like this:
esame$fin<-ifelse (esame$media<0.6 & datofinale$id_poll=="360", "1", "0")
I could substitute the "0" value with another 'ifelse' statement and go on manually.
Is there any faster way to do that? (the list containing all the condititions is actually pretty larger than the example).
Thank you for any advice.
Not too nice, but this should work:
require(dplyr)
inner_join(Data,Data1,by=c("id_poll"="ID")) %>% rowwise() %>%
mutate(fin = findInterval(media,c(-Inf,Range1,Range2,Range3),left.open=TRUE))
Reproducible data
esame <- structure(list(media = c(5.33, 0.06833333, 0, NA, 0.82, 0.3416667,
0, 0, 0, NA, 0.615, 0, 0, NA, 0, 0, 0, 1.298333), id_poll = 360:377,
fin = c(1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L)), .Names = c("media", "id_poll", "fin"
), row.names = c(NA, -18L), class = c("data.table", "data.frame"
), .internal.selfref = <pointer: 0x0000000014320788>)
df1 <- structure(list(Range1 = c(0.5, 0.5, 0), Range2 = c(9.9, 15.9,
4.9), Range3 = c(29.9, 49.9, 24.9), Range4 = c(">30", ">50",
">25"), ID = 360:362), .Names = c("Range1", "Range2", "Range3",
"Range4", "ID"), row.names = c(NA, -3L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x0000000014320788>)
dplyr solution
Using case_when
library(dplyr)
df2 <- left_join(esame1, df1, by=c("id_poll" = "ID")) %>%
mutate(fin = case_when( media > Range3 ~ 4,
media > Range2 ~ 3,
media > Range1 ~ 2,
media <= Range1 ~ 1,
is.na(Range1) == T ~ 0)) # else case
Output
media ID fin Range1 Range2 Range3 Range4
1 5.33000000 360 2 0.5 9.9 29.9 >30
2 0.06833333 361 1 0.5 15.9 49.9 >50
3 0.00000000 362 1 0.0 4.9 24.9 >25
4 NA 363 0 NA NA NA <NA>
5 0.82000000 364 0 NA NA NA <NA>
We can consider each row in the range data.frame as a vector and ask whether the current media value is greater than the value in this vector.
For simplicity, I'm assuming that all values in the first data.frame has a correspondent in the second, and that they are all ordered the same way.
for(i in 1:nrow(esame)) {
greater.than <- esame[i,1]>range[i,1:3] #this returns a vector of TRUE (greater than this range) and FALSE (within) you want the first FALSE
esame$fin <- max(which(greater.than))+1 #returns the position of the last TRUE +1, which is the position of the first FALSE
}
dat - first df, tad - second. It will put 0 if NA, nested ifelse() and assume that first range is from 0 to present value. However show some example result to check if it works properly.
dat$fin <- sapply(1:nrow(dat), function(x) ifelse(dat[x,1] >= tad[x,1] & !is.na(dat[x,1]), 1, ifelse(dat[x,1] >= tad[x,2] & !is.na(dat[x,1]), 2, ifelse(dat[x,1] >= tad[x,3] & !is.na(dat[x,1]), 3, 0))))
>dat
media id_poll fin
1 5.33000000 360 1
2 0.06833333 361 0
3 0.00000000 362 1

Resources