New vector in R by values of another vector

I have a data frame of this form:
df <- data.frame(country = rep(x = LETTERS[1:4], each = 5), year = rep(2001:2005), C=runif(20,30,100), Z=rnorm(20, mean = 0, sd = 1))
I would like, for each country, to identify the value of Z when year == 2003 and to divide all values of C by that value, so each country's values of C will be divided by a different number, but the number will be the same within one country - and to save all of this in a new vector "New". So, for example, all values in C for country A will be divided by -0.80212515, for country B by -0.62305076, etc. How can I do it? Thanks!

Your data does not match the example you shared in your post. You need to use set.seed() to make it reproducible. Anyway, here's a solution using dplyr -
set.seed(42)
df <- data.frame(country = rep(x = LETTERS[1:4], each = 5),
                 year = rep(2001:2005),
                 C = runif(20, 30, 100),
                 Z = rnorm(20, mean = 0, sd = 1))
library(dplyr)
df %>%
  group_by(country) %>%
  mutate(
    New = C / Z[year == 2003]
  ) %>%
  pull(New)
# [1] -67.70760 -68.83000 -36.02216 -63.45585 -53.94507 -24.97189 -30.70301
# [8] -14.84183 -28.60558 -29.87234 -360.88226 -467.30510 -555.07518 -278.50602
# [15] -362.73532 -54.33474 -55.85181 -21.67929 -35.87291 -39.26086
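If you want New attached to the data frame as a column rather than extracted as a standalone vector, simply drop the pull() step:
df %>%
  group_by(country) %>%
  mutate(New = C / Z[year == 2003]) %>%
  ungroup()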

Another solution, using base R:
# extract Z for 2003 for each country
v1 <- df[df$year == 2003, "Z"]
# create a vector with the correct length for the division
z_2003 <- rep(x = v1, each = 5)
# new vector: C divided by Z for the appropriate year
new <- df$C / z_2003
# if you want to merge the new columns with the old data frame
df2 <- cbind(df, z_2003, new)
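A more defensive base R variant, which doesn't assume the rows are sorted by country and year, builds a named lookup vector of the 2003 values and indexes it by country (a sketch; the helper name z2003 is mine):
# Z in 2003, named by country, so the lookup works regardless of row order
z2003 <- setNames(df$Z[df$year == 2003], df$country[df$year == 2003])
df$New <- df$C / z2003[as.character(df$country)]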

A data.table alternative to @Shree's dplyr solution:
library(data.table)
set.seed(42)
dt <- data.table(country = rep(x = LETTERS[1:4], each = 5),
                 year = rep(2001:2005),
                 C = runif(20, 30, 100),
                 Z = rnorm(20, mean = 0, sd = 1))
dt[, New := C / Z[year == 2003], by = "country"]
dt
dt
# country year C Z New
# 1: A 2001 94.03642 1.3048697 -67.70760
# 2: A 2002 95.59528 2.2866454 -68.83000
# 3: A 2003 50.02977 -1.3888607 -36.02216
# 4: A 2004 88.13133 -0.2787888 -63.45585
# 5: A 2005 74.92219 -0.1333213 -53.94507
# 6: B 2001 66.33672 0.6359504 -24.97189
# 7: B 2002 81.56118 -0.2842529 -30.70301
# 8: B 2003 39.42666 -2.6564554 -14.84183
# 9: B 2004 75.98946 -2.4404669 -28.60558
# 10: B 2005 79.35453 1.3201133 -29.87234
# 11: C 2001 62.04192 -0.3066386 -360.88226
# 12: C 2002 80.33786 -1.7813084 -467.30510
# 13: C 2003 95.42706 -0.1719174 -555.07518
# 14: C 2004 47.88002 1.2146747 -278.50602
# 15: C 2005 62.36050 1.8951935 -362.73532
# 16: D 2001 95.80102 -0.4304691 -54.33474
# 17: D 2002 98.47585 -0.2572694 -55.85181
# 18: D 2003 38.22412 -1.7631631 -21.67929
# 19: D 2004 63.24980 0.4600974 -35.87291
# 20: D 2005 69.22329 -0.6399949 -39.26086
And an option that relies on neither data.table nor dplyr:
do.call(rbind,
        by(df, df$country, FUN = function(a) transform(a, New = C / Z[year == 2003])))

Use split to process each country's data separately, then combine the results:
r <- sapply(split(df, df$country), function(x) x$C / x$Z[x$year == 2003])
d <- tidyr::gather(as.data.frame(r), country, New)
Edit - a fully reproducible version:
set.seed(0)
df <- data.frame(country = rep(x = LETTERS[1:4], each = 5), year = rep(2001:2005), C = runif(20, 30, 100), Z = rnorm(20, mean = 0, sd = 1))
r <- sapply(split(df, df$country), function(x) x$C / x$Z[x$year == 2003])
d <- tidyr::gather(as.data.frame(r), country, New)
cbind(df, d)

Related

R Merge Tabulations of data from different files

I am using R to analyze multiple experiments in which the results are stored in multiple CSV files. I run table() to tabulate the data and get results like the following:
Tabulations of Combination1.csv
A 1000
B 50
C 200
Tabulations of Combination2.csv
A 25
B 1500
D 30
Tabulations of Combination3.csv
B 19
C 500
E 2000
I want to build a table that combines these tabulations.
Combination A B C D E
c1 1000 50 200 N/A N/A
c2 25 1500 N/A 30 N/A
c3 N/A 19 500 N/A 2000
Here's how I would do it using tidyr and plyr:
Data
c1 <- rep(LETTERS[1:3], c(1000, 50, 200))
c2 <- rep(LETTERS[c(1:2, 4)], c(25, 1500, 30))
c3 <- rep(LETTERS[c(2:3, 5)], c(19, 500, 2000))
Code
library(tidyr)
library(plyr)
allC <- list(c1 = c1, c2 = c2, c3 = c3)
# get all tables in data.frame format
ldply(names(allC), function(x) {
  tab <- table(allC[[x]])
  data.frame(Combination = x, element = names(tab), Freq = c(tab))
}) %>% spread(element, Freq)
# Combination A B C D E
# 1 c1 1000 50 200 NA NA
# 2 c2 25 1500 NA 30 NA
# 3 c3 NA 19 500 NA 2000
Explanation
You transform each table to a data.frame first, appending the name of the respective combination. Then you use spread to spread the values out into one column per element.
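As an aside, in tidyr 1.0.0 and later pivot_wider() supersedes spread(); here is a minimal sketch of the same reshaping step, assuming the ldply() result above has been stored in a data frame called long (a name I've introduced here):
library(dplyr)  # for the pipe
library(tidyr)
# long has columns Combination, element, Freq (the ldply() output above)
long %>% pivot_wider(names_from = element, values_from = Freq)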
library(dplyr)
library(tidyr)
x <- table(c(rep("A", 1000), rep("B", 50), rep("C", 200)))
y <- table(c(rep("A", 25), rep("B", 1500), rep("D", 30)))
z <- table(c(rep("B", 19), rep("C", 500), rep("E", 2000)))
X <- data.frame(x) %>% spread(Var1, Freq)
Y <- data.frame(y) %>% spread(Var1, Freq)
Z <- data.frame(z) %>% spread(Var1, Freq)
X %>% full_join(Y) %>% full_join(Z) %>%
  mutate(Combination = paste0("c", seq(1,3)))
Result:
> X %>% full_join(Y) %>% full_join(Z) %>%
+ mutate(Combination = paste0("c", seq(1,3)))
Joining, by = c("A", "B")
Joining, by = c("B", "C")
A B C D E Combination
1 1000 50 200 NA NA c1
2 25 1500 NA 30 NA c2
3 NA 19 500 NA 2000 c3
Please remember next time to provide the x, y and z objects for a reproducible example :)

Moving Averages on multiple columns - Grouped Data

Apologies if this has been answered. I've gone through numerous examples today but I can't find any that match what I am trying to do.
I have a data set for which I need to calculate a 3-point moving average. I've generated some dummy data below:
set.seed(1234)
df <- data.frame(Week = rep(1:5, 3),
                 Section = c(rep("a", 5), rep("b", 5), rep("c", 5)),
                 Qty = runif(15, min = 100, max = 500),
                 To = runif(15, min = 40, max = 80))
I want to calculate the MA for each group, based on the 'Section' column, for both the 'Qty' and the 'To' columns. Ideally the output would be a data.table. The moving average should start at Week 3, i.e. it would be the average of weeks 1:3.
I am trying to master the data.table package, so a solution using it would be great, but otherwise any will be much appreciated.
Just for reference, my actual data set has approx. 70 sections and c. 1M rows in total. I've found data.table to be extremely fast at crunching these kinds of volumes so far.
We could use rollmean from the zoo package, in combination with data.table.
library(data.table)
library(zoo)
setDT(df)[, c("Qty.mean", "To.mean") := lapply(.SD, rollmean, k = 3, fill = NA, align = "right"),
          .SDcols = c("Qty", "To"), by = Section]
> df
# Week Section Qty To Qty.mean To.mean
#1: 1 a 145.4814 73.49183 NA NA
#2: 2 a 348.9198 51.44893 NA NA
#3: 3 a 343.7099 50.67283 279.3703 58.53786
#4: 4 a 349.3518 47.46891 347.3271 49.86356
#5: 5 a 444.3662 49.28904 379.1426 49.14359
#6: 1 b 356.1242 52.66450 NA NA
#7: 2 b 103.7983 52.10773 NA NA
#8: 3 b 193.0202 46.36184 217.6476 50.37802
#9: 4 b 366.4335 41.59984 221.0840 46.68980
#10: 5 b 305.7005 48.75198 288.3847 45.57122
#11: 1 c 377.4365 72.42394 NA NA
#12: 2 c 317.9899 61.02790 NA NA
#13: 3 c 213.0934 76.58633 302.8400 70.01272
#14: 4 c 469.3734 73.25380 333.4856 70.28934
#15: 5 c 216.9263 41.83081 299.7977 63.89031
A solution using dplyr:
library(dplyr)
library(zoo)
myfun <- function(x) rollmean(x, k = 3, fill = NA, align = "right")
df %>% group_by(Section) %>% mutate_each(funs(myfun), Qty, To)
#### Week Section Qty To
#### (int) (fctr) (dbl) (dbl)
#### 1 1 a NA NA
#### 2 2 a NA NA
#### 3 3 a 279.3703 58.53786
#### 4 4 a 347.3271 49.86356
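Note that mutate_each() and funs() have since been deprecated; in dplyr 1.0.0 and later the equivalent, reusing the same myfun as above, is across():
df %>%
  group_by(Section) %>%
  mutate(across(c(Qty, To), myfun))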
There is now a faster approach, using the frollmean function introduced in data.table 1.12.0:
setDT(df)[, c("Qty.mean", "To.mean") := frollmean(.SD, 3),
          .SDcols = c("Qty", "To"),
          by = Section]
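frollmean() defaults to fill = NA and align = "right", which is why this matches the rollmean() output above; spelled out explicitly, the call is equivalent to:
setDT(df)[, c("Qty.mean", "To.mean") := frollmean(.SD, n = 3, fill = NA, align = "right"),
          .SDcols = c("Qty", "To"),
          by = Section]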

Sum of two Columns of Data Frame with NA Values

I have a data frame with some NA values. I need the sum of two of the columns. If a value is NA, I need to treat it as zero.
a b c d
1 2 3 4
5 NA 7 8
Column e should be the sum of b and c:
e
5
7
I have tried a lot of things, and done two dozen searches with no luck. It seems like a simple problem. Any help would be appreciated!
dat$e <- rowSums(dat[,c("b", "c")], na.rm=TRUE)
dat
# a b c d e
# 1 1 2 3 4 5
# 2 5 NA 7 8 7
dplyr solution, taken from here:
library(dplyr)
dat %>%
  rowwise() %>%
  mutate(e = sum(b, c, na.rm = TRUE))
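rowwise() can be slow on large data; a vectorised dplyr alternative is to treat each NA as zero with coalesce() before adding (a sketch):
library(dplyr)
dat %>% mutate(e = coalesce(b, 0) + coalesce(c, 0))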
Here is another solution, with nested ifelse():
dat$e <- ifelse(is.na(dat$b) & is.na(dat$c), 0,
                ifelse(is.na(dat$b), dat$c,
                       ifelse(is.na(dat$c), dat$b, dat$b + dat$c)))
# a b c d e
#1 1 2 3 4 5
#2 5 NA 7 8 7
Edit: here is another solution that uses with(), as suggested by @kasterma in the comments; this is much more readable and straightforward:
dat$e <- with(dat, ifelse(is.na(b) & is.na(c), 0,
                          ifelse(is.na(b), c,
                                 ifelse(is.na(c), b, b + c))))
If you want to keep NA when both columns have it, you can do it the data.table way.
Data (a sample):
library(data.table)
dt <- data.table(x = sample(c(NA, 1, 2, 3), 100, replace = TRUE),
                 y = sample(c(NA, 1, 2, 3), 100, replace = TRUE))
Solution:
dt[, z := ifelse(is.na(x) & is.na(y), NA_real_, rowSums(.SD, na.rm = TRUE)), .SDcols = c("x", "y")]
I hope it helps.
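Newer versions of data.table also provide fcoalesce(), which shortens this pattern: the sum is used when both values are present, the single non-NA value otherwise, and NA survives only when both are missing (a sketch):
dt[, z := fcoalesce(x + y, x, y)]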
In some cases you have a few columns that are not numeric; this approach will serve you there too.
Note that c_across() requires dplyr version 1.0.0 or later.
library(dplyr)
df <- data.frame(
  TEXT = c("text1", "text2"), a = c(1, 5), b = c(2, NA), c = c(3, 7), d = c(4, 8))
df2 <- df %>%
  rowwise() %>%
  mutate(e = sum(c_across(a:d), na.rm = TRUE))
df2
# A tibble: 2 x 6
# Rowwise:
# TEXT a b c d e
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 text1 1 2 3 4 10
# 2 text2 5 NA 7 8 20

Data table operations with multiple group by variable sets

I have a data.table on which I would like to perform group-by operations, but I would like to retain the null grouping set (the grand total) and use several different sets of group-by variables.
A toy example:
library(data.table)
set.seed(1)
DT <- data.table(
  id = sample(c("US", "Other"), 25, replace = TRUE),
  loc = sample(LETTERS[1:5], 25, replace = TRUE),
  index = runif(25)
)
I would like to find the sum of index by all combinations of the key variables (including the null set). The concept is analogous to "grouping sets" in Oracle SQL; here is an example of my current workaround:
rbind(
  DT[, list(id = "", loc = "", sindex = sum(index)), by = NULL],
  DT[, list(loc = "", sindex = sum(index)), by = "id"],
  DT[, list(id = "", sindex = sum(index)), by = "loc"],
  DT[, list(sindex = sum(index)), by = c("id", "loc")]
)[order(id, loc)]
id loc sindex
1: 11.54218399
2: A 2.82172063
3: B 0.98639578
4: C 2.89149433
5: D 3.93292900
6: E 0.90964424
7: Other 6.19514146
8: Other A 1.12107080
9: Other B 0.43809711
10: Other C 2.80724742
11: Other D 1.58392886
12: Other E 0.24479728
13: US 5.34704253
14: US A 1.70064983
15: US B 0.54829867
16: US C 0.08424691
17: US D 2.34900015
18: US E 0.66484697
Is there a preferred "data.table" way to accomplish this?
As of this commit, this is now possible with the development version of data.table, using cube() or groupingsets():
library("data.table")
# data.table 1.10.5 IN DEVELOPMENT built 2017-08-08 18:31:51 UTC
cube(DT, list(sindex = sum(index)), by = c("id", "loc"))
# id loc sindex
# 1: US B 0.54829867
# 2: US A 1.70064983
# 3: Other B 0.43809711
# 4: Other E 0.24479728
# 5: Other C 2.80724742
# 6: Other A 1.12107080
# 7: US E 0.66484697
# 8: US D 2.34900015
# 9: Other D 1.58392886
# 10: US C 0.08424691
# 11: NA B 0.98639578
# 12: NA A 2.82172063
# 13: NA E 0.90964424
# 14: NA C 2.89149433
# 15: NA D 3.93292900
# 16: US NA 5.34704253
# 17: Other NA 6.19514146
# 18: NA NA 11.54218399
groupingsets(DT, j = list(sindex = sum(index)), by = c("id", "loc"), sets = list(character(), "id", "loc", c("id", "loc")))
# id loc sindex
# 1: NA NA 11.54218399
# 2: US NA 5.34704253
# 3: Other NA 6.19514146
# 4: NA B 0.98639578
# 5: NA A 2.82172063
# 6: NA E 0.90964424
# 7: NA C 2.89149433
# 8: NA D 3.93292900
# 9: US B 0.54829867
# 10: US A 1.70064983
# 11: Other B 0.43809711
# 12: Other E 0.24479728
# 13: Other C 2.80724742
# 14: Other A 1.12107080
# 15: US E 0.66484697
# 16: US D 2.34900015
# 17: Other D 1.58392886
# 18: US C 0.08424691
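The same development version also gained rollup(), which computes just the hierarchical chain of grouping sets ((id, loc), (id), ()) rather than all combinations; a sketch:
rollup(DT, j = list(sindex = sum(index)), by = c("id", "loc"))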
I have a generic function to which you can feed a data.table and a vector of dimensions you wish to group by; it returns the sum of all numeric fields grouped by those dimensions.
rollSum <- function(input, dimensions) {
  # cast dimension inputs to character in case a dimension input is numeric
  for (x in 1:length(dimensions)) {
    input[[eval(dimensions[x])]] <- as.character(input[[eval(dimensions[x])]])
  }
  numericColumns <- which(sapply(input, class) %in% c("integer", "numeric"))
  output <- input[, lapply(.SD, sum, na.rm = TRUE), by = eval(dimensions),
                  .SDcols = numericColumns]
  return(output)
}
So then you can create a list of your different group by vectors:
groupings = list(c("id"),c("loc"),c("id","loc"))
And then use it with lapply and rbindlist in the way of:
groupedSets <- rbindlist(lapply(groupings, function(x) {
  rollSum(DT, x)
}), fill = TRUE)
Using dplyr, an adaptation of this should work, if I understand your question correctly.
library(dplyr)
sums <- mtcars %>%
  group_by(vs, am) %>%
  summarise(Sum = sum(mpg))
I didn't check how it treats the missing values, but it should just make another group of them (the last group).

Cross-correlation with multiple groups in one data.table

I'd like to calculate the cross-correlations between groups of time series within one data.table. I have time series data in this format:
data = data.table( group = c(rep("a", 5),rep("b",5),rep("c",5)) , Y = rnorm(15) )
group Y
1: a 0.90855520
2: a -0.12463737
3: a -0.45754652
4: a 0.65789709
5: a 1.27632196
6: b 0.98483700
7: b -0.44282527
8: b -0.93169070
9: b -0.21878359
10: b -0.46713392
11: c -0.02199363
12: c -0.67125826
13: c 0.29263953
14: c -0.65064603
15: c -1.41143837
Each group has the same number of observations. What I am looking for is a way to obtain cross correlation between the groups:
group.1 group.2 correlation
a b 0.xxx
a c 0.xxx
b c 0.xxx
I am working on a script to subset each group and append the cross-correlations, but the data size is fairly large. Is there any efficient / zen way to do this?
Does this help?
data[,id:=rep(1:5,3)]
dtw = dcast.data.table(data, id ~ group, value.var="Y" )[, id := NULL]
cor(dtw)
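cor(dtw) returns the full correlation matrix; to get the pairwise three-row format asked for, you can pull out the upper triangle (a sketch):
m <- cor(dtw)
idx <- which(upper.tri(m), arr.ind = TRUE)
data.table(group.1 = rownames(m)[idx[, 1]],
           group.2 = colnames(m)[idx[, 2]],
           correlation = m[idx])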
See Correlation between groups in R data.table
Another way would be:
# data
set.seed(45L)
data = data.table( group = c(rep("a", 5),rep("b",5),rep("c",5)) , Y = rnorm(15) )
# method 2
setkey(data, "group")
data2 = data[J(c("b", "c", "a"))][, list(group2=group, Y2=Y)]
data[, c(names(data2)) := data2]
data[, cor(Y, Y2), by=list(group, group2)]
# group group2 V1
# 1: a b -0.2997090
# 2: b c 0.6427463
# 3: c a -0.6922734
And to generalize this "other" way to more than three groups...
data <- data.table(group = c(rep("a", 5), rep("b", 5), rep("c", 5), rep("d", 5)),
                   Y = rnorm(20))
setkey(data, "group")
groups = unique(data$group)
ngroups = length(groups)
library(gtools)
pairs = combinations(ngroups,2,groups)
d1 = data[pairs[,1],,allow.cartesian=TRUE]
d2 = data[pairs[,2],,allow.cartesian=TRUE]
d1[,c("group2","Y2"):=d2]
d1[,cor(Y,Y2), by=list(group,group2)]
# group group2 V1
# 1: a b 0.10742799
# 2: a c 0.52823511
# 3: a d 0.04424170
# 4: b c 0.65407400
# 5: b d 0.32777779
# 6: c d -0.02425053
