r Replace only some table values with values from alternate table - r

This is not a "vlookup-and-fill-down" question.
My source data is excellent at delivering all the data I need, just not in a usable form. Recent changes in volume mean manually adjusted fixes are no longer feasible.
I have an inventory table and a services table. The inventory report does not contain purchase order data for services or non-inventory items. The services table (naturally) does. They are of course different shapes.
Pseudo-coding would be something to the effect of for every inventory$Item in services$Item, replace inventory$onPO with services$onPO.
Sample Data
inv <- structure(list(Item = c("10100200", "10100201", "10100202", "10100203",
"10100204", "10100205-A", "10100206", "10100207", "10100208",
"10100209", "10100210"), onHand = c(600L, NA, 39L, 0L, NA, NA,
40L, 0L, 0L, 0L, 0L), demand = c(3300L, NA, 40L, 40L, NA, NA,
70L, 126L, 10L, 10L, 250L), onPO = c(2700L, NA, 1L, 40L, NA,
NA, 30L, 126L, 10L, 10L, 250L)), .Names = c("Item", "onHand",
"demand", "onPO"), row.names = c(NA, -11L), class = c("data.table",
"data.frame"))
svc <- structure(list(Item = c("10100201", "10100204", "10100205-A"),
`Rcv'd` = c(0L, 0L, 44L), Backordered = c(20L, 100L, 18L)), .Names = c("Item",
"Rcv'd", "Backordered"), row.names = c(NA, -3L), class = c("data.table",
"data.frame"))

Assuming you want to replace NAs in onPO with values from Backordered here is a solution using dplyr::left_join:
library(dplyr)
# Join on the shared key explicitly (no "Joining, by = ..." message and no
# accidental extra keys), then fill only the missing onPO values from
# Backordered and drop the helper columns brought in from svc.
left_join(inv, svc, by = "Item") %>%
  mutate(onPO = ifelse(is.na(onPO), Backordered, onPO)) %>%
  select(-Backordered, -`Rcv'd`)
# Item onHand demand onPO
#1 10100200 600 3300 2700
#2 10100201 NA NA 20
#3 10100202 39 40 1
#4 10100203 0 40 40
#5 10100204 NA NA 100
#6 10100205-A NA NA 18
#7 10100206 40 70 30
#8 10100207 0 126 126
#9 10100208 0 10 10
#10 10100209 0 10 10
#11 10100210 0 250 250
Or a solution in base R using merge:
# merge() returns its rows sorted by the join key, not in inv's row order,
# so realign the merged rows to inv before writing the column back.
# NOTE(review): assumes Item is unique in both tables -- confirm for live data.
merged <- merge(inv, svc, by = "Item", all.x = TRUE)
merged <- merged[match(inv$Item, merged$Item), ]
inv$onPO <- with(merged, ifelse(is.na(onPO), Backordered, onPO))
Or using coalesce instead of ifelse (thanks to #thelatemail):
library(dplyr)
# Same join as above with an explicit key; coalesce() keeps the existing onPO
# and falls back to Backordered only where onPO is NA.
left_join(inv, svc, by = "Item") %>%
  mutate(onPO = coalesce(onPO, Backordered)) %>%
  select(-Backordered, -`Rcv'd`)

In data.table world, this is an "update-join". Join on "Item" and then update the values in the original set with the values from the new set:
library(data.table)
setDT(inv)
setDT(svc)
inv[svc, on="Item", c("onPO","onHand") := .(i.Backordered, `i.Rcv'd`)]
#inv original table
#svc update table
#on= match on specified variable
# := overwrite onPO with Backordered
# onHand with Rcv'd
# Item onHand demand onPO
# 1: 10100200 600 3300 2700
# 2: 10100201 0 NA 20
# 3: 10100202 39 40 1
# 4: 10100203 0 40 40
# 5: 10100204 0 NA 100
# 6: 10100205-A 44 NA 18
# 7: 10100206 40 70 30
# 8: 10100207 0 126 126
# 9: 10100208 0 10 10
#10: 10100209 0 10 10
#11: 10100210 0 250 250

Starting with the tables:
>inv
Item OnHand Demand OnPO
1: 10100200 600 3300 2700
2: 10100201 NA NA NA
3: 10100202 39 40 1
4: 10100203 0 40 40
5: 10100204 NA NA NA
6: 10100205-A NA NA NA
7: 10100206 40 70 30
8: 10100207 0 126 126
9: 10100208 0 10 10
10: 10100209 0 10 10
11: 10100210 0 250 250
> svc
Item Rcv'd Backordered
1: 10100201 0 20
2: 10100204 0 100
3: 10100205-A 44 18
After far more cursing than I'd like to admit, the simple solution that works on the above test data, and my live data proved to be:
# Insert OnHand and OnPO data from svc
# seq_len() yields an empty sequence for a zero-row table, where 1:nrow(inv)
# would iterate over c(1, 0).
for (i in seq_len(nrow(inv))) {
  # match() returns the position of the first matching svc row, or NA when
  # the item is not a service item, so a single lookup replaces %in% + which().
  x <- match(inv$Item[i], svc$Item)
  if (!is.na(x)) {
    inv$OnPO[i] <- svc$Backordered[x]
    inv$OnHand[i] <- svc$`Rcv'd`[x]
  }
}
# cleanup
inv[is.na(inv)] <- 0
Is there a simpler or more obvious method that I've overlooked?

We could use eat from my package safejoin, and "patch"
the matches from the rhs into the lhs when columns conflict.
We rename Backordered to onPO on the way so the two columns conflict as desired.
# devtools::install_github("moodymudskipper/safejoin")
library(safejoin)
library(dplyr)
eat(inv, svc, onPO = Backordered, .conflict = "patch")
# Item onHand demand onPO
# 1 10100200 600 3300 2700
# 2 10100201 NA NA 20
# 3 10100202 39 40 1
# 4 10100203 0 40 40
# 5 10100204 NA NA 100
# 6 10100205-A NA NA 18
# 7 10100206 40 70 30
# 8 10100207 0 126 126
# 9 10100208 0 10 10
# 10 10100209 0 10 10
# 11 10100210 0 250 250

Related

Fastest way to fread/process "JSON-like" column in data.table?

Here's some sample data I'm working with. DT_IN contains the input format of the data and DT_OUT contains the form that I would like to use. What's the best way to go from DT_IN to DT_OUT?
I've tried strsplit, but did not manage to order the splits to rbind them in the corresponding order. Am open to solutions, maybe Rcpp could help?
library(data.table)
DT_IN <- data.table(
user_id = c(1L, 20L, 4L, 6L, 9L),
latitude = c(-41.3103218, -40.8307381, -37.3932037, -42.7178726, -45.0156822),
longitude = c(174.824554, 172.793106, 175.840637, 170.965454, 168.731186),
parameters = c(
"{\"\"network\"\"=>\"\"Telecom NZ\"\", \"\"accuracy\"\"=>28.659999847412, \"\"internet\"\"=>\"\"4G\"\", \"\"location_age\"\"=>1}",
"{\"\"location_age\"\"=>716}", "{\"\"location_age\"\"=>851}", "{\"\"accuracy\"\"=>14, \"\"location_age\"\"=>1}",
"{\"\"network\"\"=>\"\"VodafoneNZ\"\", \"\"accuracy\"\"=>29, \"\"internet\"\"=>\"\"3G\"\", \"\"location_age\"\"=>31}"
)
)
> DT_IN
user_id latitude longitude parameters
1: 1 -41.31032 174.8246 {""network""=>""Telecom NZ"", ""accuracy""=>28.659999847412, ""internet""=>""4G"", ""location_age""=>1}
2: 20 -40.83074 172.7931 {""location_age""=>716}
3: 4 -37.39320 175.8406 {""location_age""=>851}
4: 6 -42.71787 170.9655 {""accuracy""=>14, ""location_age""=>1}
5: 9 -45.01568 168.7312 {""network""=>""VodafoneNZ"", ""accuracy""=>29, ""internet""=>""3G"", ""location_age""=>31}
DT_OUT <- data.table(
user_id = c(1L, 20L, 4L, 6L, 9L),
latitude = c(-41.3103218, -40.8307381, -37.3932037, -42.7178726, -45.0156822),
longitude = c(174.824554, 172.793106, 175.840637, 170.965454, 168.731186),
network = c('Telecom NZ', NA, NA, NA, 'VodafoneNZ'),
accuracy = c(28.659999847412, NA, NA, 14, 29),
internet = c('4G', NA, NA, NA, '3G'),
location_age = c(1, 716, 851, 1, 31)
)
> DT_OUT
user_id latitude longitude network accuracy internet location_age
1: 1 -41.31032 174.8246 Telecom NZ 28.66 4G 1
2: 20 -40.83074 172.7931 <NA> NA <NA> 716
3: 4 -37.39320 175.8406 <NA> NA <NA> 851
4: 6 -42.71787 170.9655 <NA> 14.00 <NA> 1
5: 9 -45.01568 168.7312 VodafoneNZ 29.00 3G 31
Using the jsonlite package ...
# Convert the JSON-like strings to JSON. fixed = TRUE matches the delimiters
# literally instead of compiling them as regular expressions.
DT_IN[, parameters := gsub("\"\"", "\"", parameters, fixed = TRUE)]
DT_IN[, parameters := gsub("=>", ":", parameters, fixed = TRUE)]
# Stream in the JSON (one record per row) and cbind it to the existing data.
# textConnection() returns an already-open connection; per the jsonlite docs
# stream_in() only auto-closes connections it opened itself, so close it here.
con <- textConnection(DT_IN$parameters)
DT_IN <- cbind(DT_IN, jsonlite::stream_in(con))
close(con)
# Remove `parameters`
DT_IN[, parameters := NULL]
DT_IN
# user_id latitude longitude network accuracy internet location_age
# 1: 1 -41.31032 174.8246 Telecom NZ 28.66 4G 1
# 2: 20 -40.83074 172.7931 <NA> NA <NA> 716
# 3: 4 -37.39320 175.8406 <NA> NA <NA> 851
# 4: 6 -42.71787 170.9655 <NA> 14.00 <NA> 1
# 5: 9 -45.01568 168.7312 VodafoneNZ 29.00 3G 31

Diagonals to rows in data.frame

I have a matrix-like data frame with an additional column denoting time. It contains information on the number of enrolled students in a given school, from grade 5 (column A) to grade 9 (column E).
time A B C D E
1 13 1842 1844 1689 1776 1716
2 14 1898 1785 1807 1617 1679
3 15 2065 1865 1748 1731 1590
4 16 2215 1994 1811 1708 1703
5 17 2174 2122 1903 1765 1699
I need to trace the size of the cohort over time, meaning that I need row-wise information on how many fifth graders from each starting year remained in the school from grades 6 through 9. For example, for the cohort that has begun fifth grade in 2013, I want information on how many remained in sixth grade in 2014, and so on.
Expected output
This is what I would like to end up with:
start.time point.A point.B point.C point.D point.E
1 13 1842 1785 1748 1708 1699
2 14 1898 1865 1811 1765 NA
3 15 2065 1994 1903 NA NA
4 16 2215 2122 NA NA NA
5 17 2174 NA NA NA NA
I have looked at diag() from base R, but I could only get the data from the main diagonal. Ideally, I'd like to accomplish this using dplyr syntax and the pipe.
Data
structure(list(time = 13:17, A = c(1842, 1898, 2065, 2215, 2174), B = c(1844, 1785, 1865, 1994, 2122), C = c(1689, 1807, 1748, 1811, 1903), D = c(1776, 1617, 1731, 1708, 1765), E = c(1716, 1679, 1590, 1703, 1699)), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), row.names = c(NA, -5L), vars = "time", drop = TRUE, indices = list(
0L, 1L, 2L, 3L, 4L), group_sizes = c(1L, 1L, 1L, 1L, 1L), biggest_group_size = 1L, labels = structure(list(
time = 13:17), class = "data.frame", row.names = c(NA, -5L), vars = "time", drop = TRUE, .Names = "time"), .Names = c("time", "A", "B", "C", "D", "E"))
Convert the input DF except for the first column to a matrix mat. Then since row(mat) - col(mat) is constant on diagonals split with respect to that creating a list of ts class series in L. We used ts class since we can later cbind them even if they are of different lengths. The diagonals for which row(mat) - col(mat) >= 0 are the only ones we want so pick off those, cbind them together and transpose the result. Then replace all columns in DF except the first with that. No packages are used.
mat <- as.matrix(DF[-1])
L <- lapply(split(mat, row(mat) - col(mat)), ts)
replace(DF, -1, t(do.call("cbind", L[as.numeric(names(L)) >= 0])))
giving:
time A B C D E
1 13 1842 1785 1748 1708 1699
2 14 1898 1865 1811 1765 NA
3 15 2065 1994 1903 NA NA
4 16 2215 2122 NA NA NA
5 17 2174 NA NA NA NA
Since you mentioned dplyr in your question, you could use dplyr::lead to shift the values of columns B to E by 1, 2 etc. respectively, and then bind the result with columns time and A from your original data as follows
library(tidyverse)
bind_cols(df[, 1:2], map2_df(.x = df[, c(3:ncol(df))],
.y = seq_along(df[, 3:ncol(df)]),
.f = ~dplyr::lead(x = .x, n = .y)))
# A tibble: 5 x 6
# Groups: time [5]
# time A B C D E
# <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 13 1842 1785 1748 1708 1699
#2 14 1898 1865 1811 1765 NA
#3 15 2065 1994 1903 NA NA
#4 16 2215 2122 NA NA NA
#5 17 2174 NA NA NA NA
Note that your data is grouped by time the way you provided it.
With some grouping and arranging and row_number(), we can do this with dplyr and tidyr, and we don't lose values.
Looks a bit messy, but here I create a 2-dimensional index where the second dimension is inverted. When these index positions are summed, we get a matching value for diagonal rows.
data %>%
ungroup() %>%
mutate(row = row_number()) %>%
gather(class, stud, A:E) %>%
arrange(row, desc(class)) %>%
group_by(row) %>%
mutate(time_left = row_number()) %>%
ungroup() %>%
transmute(time, class, stud, start_year = time_left + row - 1) %>%
ggplot(aes(time, stud, color = factor(start_year))) +
geom_line() +
geom_point()
Replace the mirrored upper triangle of "d" with the values from the lower triangle.
m <- as.matrix(d[-1])
d[-1] <- NA
d[-1][upper.tri(m, diag = TRUE)[ , ncol(m):1]] <- m[lower.tri(m, diag = TRUE)]
# time A B C D E
# 1 13 1842 1785 1748 1708 1699
# 2 14 1898 1865 1811 1765 NA
# 3 15 2065 1994 1903 NA NA
# 4 16 2215 2122 NA NA NA
# 5 17 2174 NA NA NA NA

Choosing different amount of elements from each group in R

I am working on the Kaggle Instacart competition, but I am quite new to R and have run into something I can not figure out.
I have a dataset with 4 columns. The first column is an order ID (id1). The second column is a product ID (id2). The third column is the probability that I want to select the product id2 from the order id1, which we can consider just as a ranking, so a higher probability is always selected over a smaller probability. Finally, the fourth column is the amount of products I want to select from the given order (a feature of the order). So for example, I have here the first 12 rows of the dataframe df:
id1 id2 prob num
1 17 13107 0.4756982 3
2 17 21463 0.3724126 3
3 17 38777 0.3534422 3
4 17 21709 0.3364623 3
5 17 47766 0.3364623 3
6 17 39275 0.3165896 3
7 34 16083 0.4093785 4
8 34 39475 0.3892882 4
9 34 47766 0.3892882 4
10 34 2596 0.3837562 4
11 34 21137 0.3762758 4
12 34 47792 0.3737032 4
We can see that from the id1 = 17 I want to choose 3 elements, and for id1 = 34 I want to choose 4 elements. The result should then be
ID1 ID2
17 13107, 21463, 38777
34 16083, 39475, 47766, 2596
or something similar to this.
At the moment I have tried using
df %>% group_by(id1) %>% top_n(n = num)
but I get the error
Selecting by num
Error in is_scalar_integerish(n) : object 'num' not found
Anyone know how I would go about doing this?
Thanks
You can pipe the grouped data directly into a summarise statement:
df %>% group_by(id1) %>% summarise(id2 = toString(id2[seq_len(first(num))]))
## A tibble: 2 x 2
# id1 id2
# <int> <chr>
#1 17 13107, 21463, 38777
#2 34 16083, 39475, 47766, 2596
In this statement, the id2[seq_len(first(num))] is used to extract the first num per group, create a sequence from 1 to the num and that sequence is used to subset the first X id2 values.
The toString creates a string per id1 group.
Here's another base R option using aggregate:
aggregate(id2 ~ id1, FUN=toString, subset(df, ave(id1, id1, FUN=seq_along) <= num))
# id1 id2
#1 17 13107, 21463, 38777
#2 34 16083, 39475, 47766, 2596
Please note that I assumed the data was already ordered (as in the example) by decreasing probability.
In base R, you can use Map on the list of data frames split by ID with split to apply head to select the respective number of rows for each ID. The number of selected rows is supplied by feeding tapply the column of interest and selecting the first value with head. A data.frame with the corresponding rows is returned using do.call with rbind.
do.call(rbind, Map(head, split(dat, dat$id1), tapply(dat$num, dat$id1, head, 1)))
id1 id2 prob num
17.1 17 13107 0.4756982 3
17.2 17 21463 0.3724126 3
17.3 17 38777 0.3534422 3
34.7 34 16083 0.4093785 4
34.8 34 39475 0.3892882 4
34.9 34 47766 0.3892882 4
34.10 34 2596 0.3837562 4
It is a bit simpler to return a named list of the first dat$num elements where the names in the list correspond to the id1.
Map(head, split(dat$id2, dat$id1), tapply(dat$num, dat$id1, head, 1))
$`17`
[1] 13107 21463 38777
$`34`
[1] 16083 39475 47766 2596
data
dat <-
structure(list(id1 = c(17L, 17L, 17L, 17L, 17L, 17L, 34L, 34L,
34L, 34L, 34L, 34L), id2 = c(13107L, 21463L, 38777L, 21709L,
47766L, 39275L, 16083L, 39475L, 47766L, 2596L, 21137L, 47792L
), prob = c(0.4756982, 0.3724126, 0.3534422, 0.3364623, 0.3364623,
0.3165896, 0.4093785, 0.3892882, 0.3892882, 0.3837562, 0.3762758,
0.3737032), num = c(3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L,
4L)), .Names = c("id1", "id2", "prob", "num"), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12"))
Having one row per ID may seem nice, but a list column often ends up being a pain to work with; it's not "tidy." Here's a simple dplyr pipeline that sticks to the verbs that make sense: separate by group, filter rows, put back together.
df %>%
group_by(id1) %>%
filter(seq_along(num) <= num) %>%
ungroup() %>%
select(id1, id2)
# A tibble: 7 x 2
id1 id2
<int> <int>
1 17 13107
2 17 21463
3 17 38777
4 34 16083
5 34 39475
6 34 47766
7 34 2596
You can try this by using #lmo's data
dat%>%group_by(id1)%>%arrange(-prob)%>%dplyr::summarise(ID2=paste(id2[1:unique(num)],collapse=","))
With data.table:
library(data.table)
setDT(df)[order(-prob), .(id2 = toString(head(id2, first(num)))), by = id1]
id1 id2
1: 17 13107, 21463, 38777
2: 34 16083, 39475, 47766, 2596
Here, df is coerced to data.table, ordered by decreasing probability. For each group in id1, the num topmost values are picked and aggregated into one string.
This returns id2 as character. If it is required to continue processing, it might be useful to keep the id2 values separate in a list:
setDT(df)[order(-prob), .(id2 = list(head(id2, first(num)))), by = id1]
Data
df <- fread(
"rn id1 id2 prob num
1 17 13107 0.4756982 3
2 17 21463 0.3724126 3
3 17 38777 0.3534422 3
4 17 21709 0.3364623 3
5 17 47766 0.3364623 3
6 17 39275 0.3165896 3
7 34 16083 0.4093785 4
8 34 39475 0.3892882 4
9 34 47766 0.3892882 4
10 34 2596 0.3837562 4
11 34 21137 0.3762758 4
12 34 47792 0.3737032 4")

R: Split Variable Column into multiple (unbalanced) columns by comma

I have a dataset of 25 variables and over 2 million observations. One of my variables is a combination of a few different "categories" that I want to split to where it shows 1 category per column (similar to what split would do in stata). For example:
# Name Age Number Events First
# Karen 24 8 Triathlon/IM,Marathon,10k,5k 0
# Kurt 39 2 Half-Marathon,10k 0
# Leah 18 0 1
And I want it to look like:
# Name Age Number Events_1 Event_2 Events_3 Events_4 First
# Karen 24 8 Triathlon/IM Marathon 10k 5k 0
# Kurt 39 2 Half-Marathon 10k NA NA 0
# Leah 18 0 NA NA NA NA 1
I have looked through stackoverflow but have not found anything that works (everything gives me an error of some sort). Any suggestions would be greatly appreciated.
Note: May not be important but the largest number of categories 1 person has is 19 therefore I would need to create Event_1:Event_19
Comment: Previous Stack Overflow answers have suggested the separate function; however, this function does not seem to work with my dataset. When I input the function the program runs, but when it is finished nothing is changed, there is no output, and no error code. When I tried to use other suggestions made in other threads I received error messages. However, I finally got it to work by using the cSplit function. Thanks for the help!!!
From Ananda's splitstackshape package:
cSplit(df, "Events", sep=",")
# Name Age Number First Events_1 Events_2 Events_3 Events_4
#1: Karen 24 8 0 Triathlon/IM Marathon 10k 5k
#2: Kurt 39 2 0 Half-Marathon 10k NA NA
#3: Leah 18 0 1 NA NA NA NA
Or with tidyr:
separate(df, 'Events', paste("Events", 1:4, sep="_"), sep=",", extra="drop")
# Name Age Number Events_1 Events_2 Events_3 Events_4 First
#1 Karen 24 8 Triathlon/IM Marathon 10k 5k 0
#2 Kurt 39 2 Half-Marathon 10k <NA> <NA> 0
#3 Leah 18 0 NA <NA> <NA> <NA> 1
With the data.table package:
# Split Events on commas into four new columns, then drop the original column.
# with = FALSE is spelled out because F is an ordinary, reassignable variable.
setDT(df)[, paste0("Events_", 1:4) := tstrsplit(Events, ",")][, -"Events", with = FALSE]
# Name Age Number First Events_1 Events_2 Events_3 Events_4
#1: Karen 24 8 0 Triathlon/IM Marathon 10k 5k
#2: Kurt 39 2 0 Half-Marathon 10k NA NA
#3: Leah 18 0 1 NA NA NA NA
Data
df <- structure(list(Name = structure(1:3, .Label = c("Karen", "Kurt",
"Leah "), class = "factor"), Age = c(24L, 39L, 18L), Number = c(8L,
2L, 0L), Events = structure(c(3L, 2L, 1L), .Label = c(" NA",
" Half-Marathon,10k", " Triathlon/IM,Marathon,10k,5k"
), class = "factor"), First = c(0L, 0L, 1L)), .Names = c("Name",
"Age", "Number", "Events", "First"), class = "data.frame", row.names = c(NA,
-3L))

Subtraction on different rows and columns and separated by group

I really hate to ask two questions in a row but this is something that I can’t wrap my head around. So let’s say I have a data frame, as follows:
df
Row# User Morning Evening Measure Date
1 1 NA NA 2/18/11
2 1 50 115 2/19/11
3 1 85 128 2/20/11
4 1 62 NA 2/25/11
5 1 48 100.8 3/8/11
6 1 19 71 3/9/11
7 1 25 98 3/10/11
8 1 NA 105 3/11/11
9 2 48 105 2/18/11
10 2 28 203 2/19/11
11 2 35 80.99 2/21/11
12 2 91 78.25 2/22/11
Is it possible in R to take the difference between the previous consecutive day (and only the previous day, not the previous result) evening value of 1 row and the morning value of a different row for each user group? So my desired results would be this.
df
Row# User Morning Evening Date Difference
1 1 NA NA 2/18/11 NA
2 1 50 115 2/19/11 NA
3 1 85 129 2/20/11 30
4 1 62 NA 2/25/11 NA
5 1 48 100.8 3/8/11 NA
6 1 19 71 3/9/11 81.8
7 1 25 98 3/10/11 46
8 1 10 105 3/11/11 88
9 2 48 105 2/18/11 NA
10 2 28 203 2/19/11 77
11 2 35 80.99 2/21/11 NA
12 2 91 78.25 2/22/11 -10.01
All I want this to do is to take the morning value and subtract it from the evening value of the previous consecutive day for each user group. As you can see, some parts of my data frame contain NA values in the morning and evening columns, in addition, not all of the dates are in consecutive order for each different user, so naturally, NA should be assigned.
I've tried searching google but there wasn't much information on being able to apply functions to different rows for each group of rows on different columns (if that makes any sense).
My attempts include many variations of this.
df$Difference<-ave((df$Morning,df$Evening),
df$User,
FUN=function(x){
c('NA',diff(df$Evening-df$Morning)),na.rm=T
})
Again, any help would be greatly appreciated. Thanks.
Note: The input data you show and the output data are not the same. There is a NA which is replaced by 10 in output and the last date is 2/14/11 in input and 2/22/11 in output.
I've assumed the output to be the original data to create this answer to match your result.
# Evening of the previous row minus Morning of the current row.
df$Diff <- c(NA, head(df$Evening, -1) - tail(df$Morning, -1))
# Keep the difference only where the previous row is exactly one day earlier.
# The dates carry two-digit years ("2/18/11"), so they are parsed with %y;
# %Y would read them as the year 11.
df$Diff[which(c(0, diff(as.Date(as.character(df$Measure_Date),
                                format = "%m/%d/%y"))) != 1)] <- NA
> df
# Row User Morning Evening Measure_Date Diff
# 1 1 1 NA NA 2/18/11 NA
# 2 2 1 50 115.00 2/19/11 NA
# 3 3 1 85 128.00 2/20/11 30.00
# 4 4 1 62 NA 2/25/11 NA
# 5 5 1 48 100.80 3/8/11 NA
# 6 6 1 19 71.00 3/9/11 81.80
# 7 7 1 25 98.00 3/10/11 46.00
# 8 8 1 10 105.00 3/11/11 88.00
# 9 9 2 48 105.00 2/18/11 NA
# 10 10 2 28 203.00 2/19/11 77.00
# 11 11 2 35 80.99 2/21/11 NA
# 12 12 2 91 78.25 2/22/11 -10.01
#user1342086's edit (that got rejected, but was right indeed):
# diff(df$User)[k] compares rows k and k + 1, and Diff[k + 1] is the value
# that mixes those two rows, so blank the first row of each new user
# (index k + 1), not the last row of the previous one.
df$Diff[which(diff(df$User) != 0) + 1] <- NA
seems to take care of the grouping by "User".
A blind first shot (untested). Relies on the data frame being already sorted by User and Date.
# If necessary, convert the dates from factor to Date. The column used below
# is Measure_Date, so that is the one converted here.
df$Measure_Date <- as.Date(as.character(df$Measure_Date), format = "%m/%d/%y")
df <- within(df, {
  # Both conditions are padded with a leading NA so they have one entry per
  # row; the first row has no predecessor and therefore yields NA.
  Difference <- ifelse(
    c(NA, as.numeric(diff(Measure_Date))) == 1 & c(NA, diff(User)) == 0,
    c(NA, head(Evening, -1)) - Morning,
    NA
  )
})
I used plyr, so be sure you have it installed. This solution should work even if user data are mixed (i.e. not in consecutive rows) and dates are not in chronological order.
# Your example data, as you should post it for us to use
df <-
structure(list(User = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L), Morning = c(NA, 50L, 85L, 62L, 48L, 19L, 25L, NA, 48L,
28L, 35L, 91L), Evening = c(NA, 115, 128, NA, 100.8, 71, 98,
105, 105, 203, 80.99, 78.25), Measure_Date = structure(c(1L,
2L, 3L, 5L, 9L, 10L, 6L, 7L, 1L, 2L, 4L, 8L), .Label = c("2/18/11",
"2/19/11", "2/20/11", "2/21/11", "2/25/11", "3/10/11", "3/11/11",
"3/14/11", "3/8/11", "3/9/11"), class = "factor")), .Names = c("User",
"Morning", "Evening", "Measure_Date"), class = "data.frame", row.names = c(NA,
-12L))
# As already stated by Arun, you need the date as class Date
df$Measure_Date <- as.Date(df$Measure_Date, format='%m/%d/%y')
# Use plyr to process the data frame one user at a time; ddply() calls .fun
# on each user's rows and row-binds the results.
library(package=plyr)
ddply(.data=df, .variables='User',
.fun=function(x){
# Complete daily sequence of dates from this user's first to last measurement
tdf <- data.frame(Measure_Date=seq(from=min(x$Measure_Date),
to=max(x$Measure_Date),
by='1 day'))
# Merge to fill in NAs for dates on which the user has no measurement
tdf <- merge(tdf, x, all=TRUE)
# Shift Evening down one row so each date sits next to the previous
# calendar day's evening value
tdf$Evening <- c(NA, tdf$Evening[-length(tdf$Evening)])
# Difference: previous day's evening minus this day's morning
tdf$Difference <- tdf$Evening - tdf$Morning
# Keep only the key and the result, then merge back so the filler dates
# are dropped and only the user's original rows are returned
tdf <- tdf[,c('Measure_Date', 'Difference')]
x <- merge(x, tdf)
x
})

Resources