multiply columns of data frames - r

I have been scratching my head over this. I have two data frames: df
df <- data.frame(group = 1:3,
age = seq(30, 50, length.out = 3),
income = seq(100, 500, length.out = 3),
assets = seq(500, 800, length.out = 3))
and weights
weights <- data.frame(age = 5, income = 10)
I would like to multiply these two data frames only for the same column names. I tried something like this:
colwise(function(x) {x * weights[names(x)]})(df)
but that obviously didn't work as colwise does not keep the column name inside the function. I looked at various mapply solutions (example), but I am unable to come up with an answer.
The resulting data.frame should look like this:
structure(list(group = 1:3, age = c(150, 200, 250), income = c(1000,
3000, 5000), assets = c(500, 650, 800)), .Names = c("group",
"age", "income", "assets"), row.names = c(NA, -3L), class = "data.frame")
group age income assets
1 1 150 1000 500
2 2 200 3000 650
3 3 250 5000 800

sweep() is your friend here, for this particular example. It relies upon the names in df and weights being in the right order, but that can be arranged.
> nams <- names(weights)
> df[, nams] <- sweep(df[, nams], 2, unlist(weights), "*")
> df
group age income assets
1 1 150 1000 500
2 2 200 3000 650
3 3 250 5000 800
If the variable names in weights and df are not in the same order, you can make them so:
> df2 <- data.frame(group = 1:3,
+ age = seq(30, 50, length.out = 3),
+ income = seq(100, 500, length.out = 3),
+ assets = seq(500, 800, length.out = 3))
> nams <- c("age", "income") ## order in df2
> weights2 <- weights[, rev(nams)]
> weights2 ## wrong order compared to df2
income age
1 10 5
> df2[, nams] <- sweep(df2[, nams], 2, unlist(weights2[, nams]), "*")
> df2
group age income assets
1 1 150 1000 500
2 2 200 3000 650
3 3 250 5000 800
In other words we reorder all objects so that age and income are in the right order.

Someone might have a slick way to do it with plyr, but this is probably the most straight forward way in base R.
shared.names <- intersect(names(df), names(weights))
cols <- sapply(names(df), USE.NAMES=TRUE, simplify=FALSE, FUN=function(name)
if (name %in% shared.names) df[[name]] * weights[[name]] else df[[name]])
data.frame(do.call(cbind, cols))
# group age income assets
# 1 1 150 1000 500
# 2 2 200 3000 650
# 3 3 250 5000 800

Your data:
df <- data.frame(group = 1:3,
age = seq(30, 50, length.out = 3),
income = seq(100, 500, length.out = 3),
assets = seq(500, 800, length.out = 3))
weights <- data.frame(age = 5, income = 10)
The logic:
# Basic name matching looks like this
names(df[names(df) %in% names(weights)])
# [1] "age" "income"
# Use that in `sapply()`
sapply(names(df[names(df) %in% names(weights)]),
function(x) df[[x]] * weights[[x]])
# age income
# [1,] 150 1000
# [2,] 200 3000
# [3,] 250 5000
The implementation:
# Put it all together, replacing the original data
df[names(df) %in% names(weights)] <- sapply(names(df[names(df) %in% names(weights)]),
function(x) df[[x]] * weights[[x]])
The result:
df
# group age income assets
# 1 1 150 1000 500
# 2 2 200 3000 650
# 3 3 250 5000 800

Here is a data.table solution
library(data.table)
DT <- data.table(df)
W <- data.table(weights)
Use mapply (or Map) to calculate the new columns and add then both at once
by reference.
DT <- data.table(df)
W <- data.table(weights)
DT[, `:=`(names(W), Map('*', DT[,names(W), with = F], W)), with = F]

You could also do this in a for loop using an index resulting from which(%in%). The above approach is much more efficient but this is an alternative.
results <- list()
for ( i in 1:length(which(names(df) %in% names(weights))) ) {
idx1 <- which(names(df) %in% names(weights))[i]
idx2 <- which(names(weights) %in% names(df))[i]
results[[i]] <- dat[,idx1] * weights[idx2]
}
unlist(results)

Related

Creating a non overlapping bins in R

I have a set of x,y data (10,000). These data points are to be partitioned along the x-axis into non-overlapping bins of 10 data points each. From this, I need a new dataset, such that x = mean of these 10 data, y = maximum of these 10 data. The final data set should be 1000 sets of x,y. sample
Sample in Excel. I want to perform this task in R
In tidyverse:
library(tidyverse)
df %>%
arrange(x) %>%
group_by(grp = gl(n(), 10, n())) %>%
summarise(x = mean(x), y = max(y))
In Base R
n <- nrow(df)
do.call(rbind.data.frame, by(df[order(df$x),], gl(n, 10, n),
function(x) cbind(x = mean(x$x), y = max(x$y))))
I created some sample data as you did not provide those.
I use the library data.table but you could do similar in dplyr or base.
library(data.table)
dt <- data.table(
x = sample(40:50, 50, replace = T),
y = sample(1000:3000, 50)
)
dt[, grp := gl(.N, 10, .N)] # edit based on Onyambu's solution
dt[, .(x_avg = mean(x), y_max = max(y)), by = grp]
# grp x_avg y_max
# 1: 1 44.7 2765
# 2: 2 45.3 2861
# 3: 3 44.7 2831
# 4: 4 46.2 2947
# 5: 5 46.7 2684

Filter using Dataset Position in R

I'm not really familiar with dplyr function in R. However, I want to filter my dataset into certain conditions.
Let's say I've more than 100 of attributes in my dataset. And I want to perform filter with multiple condition.
Can I put my coding filter the position of the column instead of their name as follow:
y = filter(retag, c(4:50) != 8 & c(90:110) == 8)
I've tried few times similar with this coding, however still haven't get the result.
I also did tried coding as follow, but not sure how to add another conditions into the rowSums function.
retag[rowSums((retag!=8)[,c(4:50)])>=1,]
The only example that I found was using the dataset names instead of the position.
Or is there any way to filter using the dataset position as my data quite huge.
You can use a combination of filter() and across(). I didn't have your version of the retag dataframe so I created my own as an example
set.seed(2000)
retag <- tibble(
col1 = runif(n = 1000, min = 0, max = 10) %>% round(0),
col2 = runif(n = 1000, min = 0, max = 10) %>% round(0),
col3 = runif(n = 1000, min = 0, max = 10) %>% round(0),
col4 = runif(n = 1000, min = 0, max = 10) %>% round(0),
col5 = runif(n = 1000, min = 0, max = 10) %>% round(0)
)
# filter where the first, second, and third column all equal 5 and the fourth column does not equal 5
retag %>%
filter(
across(1:3, function(x) x == 5),
across(4, function(x) x != 5)
)
if_all() and if_any() were recently introduced into the tidyverse for the purpose of filtering across multiple variables.
library(dplyr)
filter(retag, if_all(X:Y, ~ .x > 10 & .x < 35))
# # A tibble: 5 x 2
# X Y
# <int> <int>
# 1 11 30
# 2 12 31
# 3 13 32
# 4 14 33
# 5 15 34
filter(retag, if_any(X:Y, ~ .x == 2 | .x == 25))
# # A tibble: 2 x 2
# X Y
# <int> <int>
# 1 2 21
# 2 6 25
Data
retag <- structure(list(X = 1:20, Y = 20:39), row.names = c(NA, -20L), class = c("tbl_df",
"tbl", "data.frame"))
Here's a base R option.
This will select rows where there is no 8 in column 4 to 50 and there is at least one 8 in column 90 to 110.
result <- retag[rowSums(retag[4:50] == 8, na.rm = TRUE) == 0 &
rowSums(retag[90:110] == 8,na.rm = TRUE) > 0, ]

Distilling summary statistics by numerical categories with dplyr

I have a large (rows > 200000) data frame with dozens of columns of data. I want to distill this data frame down and summarize the number of data that have variables that fall within given ranges.
For instance, if I have a data.frame that is similar to this:
set.seed(10)
df <- data.frame( age = runif( n = 1000, min = 0, max = 4000 ),
size = rnorm( n = 1000, mean = 10, sd = 1 ),
shape = rnorm( n = 1000, mean = 1000, sd = 1000) )
and I would like to group get the number of samples within a series of age ranges, the mean size and shape, and the median size and shape from the samples in each of those age brackets.
Something like
summary.df <- data.frame( age.group = seq( 0, 3900, by = 100 ),
number = (number of samples in age bin),
mean = ( mean of data in age bin ) )
etc.
Right now I am doing this very bluntly by creating a new data.frame for each age group.
data.1 <- subset( df, age > 0 & age <= 100 )
data.2 <- subset( df, age > 100 & age <= 200 )
data.3 <- subset( df, age > 200 & age <= 300 )
etc.
and then adding a categorical variable
data.1 <- data.frame( data.1, age.group = "100", count.row = nrow( data.1 ) )
data.2 <- data.frame( data.2, age.group = "200", count.row = nrow( data.2 ) )
data.3 <- data.frame( data.3, age.group = "300", count.row = nrow( data.3 ) )
adding them together
data.big <- rbind( data.1, data.2, data.3 )
and then generating summary stats via dplyr
data.summary <- data.big %>%
group_by( age.group ) %>%
summarize( count.row = mean( count.row ),
mean = mean( size, na.rm = TRUE ),
median = median( size, na.rm = T ) )
How would I go about doing this more efficiently with just dplyr? I think there must be a way but I can't wrap my head around it.
Thanks for any help you can give!
You can make use of cut to divide the data in intervals of 100 and calculate summary statistics for each group.
library(dplyr)
df %>%
group_by(age = cut(age, seq( 0, 4000, by = 100))) %>%
summarise(mean = mean( size, na.rm = TRUE),
median = median( size, na.rm = TRUE))
# age mean median
# <fct> <dbl> <dbl>
# 1 (0,100] 10.0 9.92
# 2 (100,200] 9.88 10.2
# 3 (200,300] 10.1 10.1
# 4 (300,400] 9.83 9.80
# 5 (400,500] 9.95 9.72
# 6 (500,600] 9.68 9.78
# 7 (600,700] 10.2 10.5
# 8 (700,800] 10.2 10.4
# 9 (800,900] 9.68 9.47
#10 (900,1e+03] 9.80 9.81
# … with 30 more rows

finding outliers and counting number of occurrence

I have a data frame as below
raw_data <- data.frame(
"id" = c(1, 1, 1, 2, 2),
"salary" = c(10000,15000,20000,40000,50000),
"expenditure" = c(10000,15000,20000,30000,40000))
if a salary greater than 15000 would be flaged as an outlier, and if an expenditure is greater than 10000, it should be flaged as an outlier. But the problem is now, how to count how many times an outlier (both spearately) occured by a specific id. The output should look like the following df
output <- data.frame(
"id"=c(1,1,1,2,2,2,2),
"question_name"=c("expenditure", "salary","expenditure","salary","expenditure","salary","expenditure"),
"values"=c(15000,20000,20000,30000,40000,500000,40000),
"count"=c(1,1,1,1,1,1,1))
Here's a dplyr solution:
raw_data %>%
mutate(salary_flag =
ifelse(salary > 15000, 1, 0),
expenditure_flag = ifelse(expenditure > 10000, 1, 0)) %>%
group_by(id) %>%
mutate(total_outlier = sum(salary_flag) + sum(expenditure_flag))
You are flagging for salary and expenditure, then grouping by id and calculating the sum of all salary_flag and the sum of all expenditure_flag for each id.
id salary expenditure salary_flag expenditure_flag total_outlier
<int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 10000 10000 0 0 0
2 2 15000 15000 0 1 1
3 3 20000 20000 1 1 2
4 4 40000 30000 1 1 2
5 5 50000 40000 1 1 2
If you're only concerned with the total outliers, #MartinGal provided a very nice option:
raw_data %>%
group_by(id) %>%
mutate(total_outlier = sum(salary>15000, expenditure>10000))
Gives us:
id salary expenditure total_outlier
<int> <dbl> <dbl> <int>
1 1 10000 10000 0
2 2 15000 15000 1
3 3 20000 20000 2
4 4 40000 30000 2
5 5 50000 40000 2
edit:
This seems to get the end result that you're looking for:
raw_data %>%
group_by(id) %>%
summarise(count = sum(salary>15000, expenditure>10000),
value = min(salary)) %>%
mutate(title = "salary") %>%
select(id, title, value, count)
Which gives you:
id title value count
<int> <chr> <dbl> <int>
1 1 salary 10000 0
2 2 salary 15000 1
3 3 salary 20000 2
4 4 salary 40000 2
5 5 salary 50000 2
Raw data is :
raw_data <- data.frame("id" = c(1, 1, 1, 2, 2),
"salary" = c(10000,15000,20000,40000,50000),
"expenditure" = c(10000,15000,15000,30000,40000))
And the solution is :
raw_data <- raw_data %>% filter(salary>15000 | expenditure>10000)
entry_variables <- raw_data %>%select(id,salary,expenditure) %>%
pivot_longer(cols = -id,
names_to = "Question_name", values_to= "Value",
values_drop_na = TRUE) %>%
count(id, Question_name, Value)
You can try the following
raw_data <- data.frame("id" = 1:5,
"salary" = c(10000,15000,20000,40000,50000),
"expenditure" = c(10000,15000,20000,30000,40000))
raw_data$SaleryOutlier <- ifelse(
raw_data$salary > 15000, TRUE, FALSE)
raw_data$ExpenditureOutlier <- ifelse(
raw_data$expenditure > 10000, TRUE, FALSE)
You can then use aggregate function to summarize the data, e.g. for each id by using FUN=sum. This should look like
aggregate(raw_data, by=list(id = raw_data$id), FUN=sum)
This works because TRUE=1.
I hope this helps.
EDIT
Based on your comment, I guess you are looking for
raw_data <- data.frame("id" = c(1, 1, 1, 2, 2),
"salary" = c(10000,15000,20000,40000,50000),
"expenditure" = c(10000,15000,20000,30000,40000))
raw_data$SaleryOutlier <- ifelse(
raw_data$salary > 15000, TRUE, FALSE)
raw_data$ExpenditureOutlier <- ifelse(
raw_data$expenditure > 10000, TRUE, FALSE)
raw_data_aggregate <- aggregate(raw_data, by=list(id = raw_data$id), FUN=sum)
raw_data_aggregate$count <- raw_data_aggregate$SaleryOutlier + raw_data_aggregate$ExpenditureOutlier
EDIT TWO
If you want to aggregate over two variables, just exchange the above aggregate with
raw_data_aggregate <- aggregate(
SalaryOutlier + ExpenditureOutlier ~ id + salary + expenditure, raw_data, FUN=sum)
EDIT THREE
Based on the comments below, I created the following code
raw_data <- data.frame(
"id" = c(1, 1, 1, 2, 2),
"salary" = c(10000,15000,20000,40000,50000),
"expenditure" = c(10000,15000,20000,30000,40000))
# Identify salary outliers
raw_data$SalaryOutlier <- ifelse(
raw_data$salary > 15000, TRUE, FALSE)
# Identify expenditure outliers
raw_data$ExpenditureOutlier <- ifelse(
raw_data$expenditure > 10000, TRUE, FALSE)
# Aggregate over id + salay
raw_data_aggregate_salary <- aggregate(
SalaryOutlier ~ id + salary, raw_data, FUN=sum)
# Aggregate over id + expenditure
raw_data_aggregate_expenditure <- aggregate(
ExpenditureOutlier ~ id + expenditure, raw_data, FUN=sum)
# Just some renaming to fit with desired output data frame.
raw_data_aggregate_salary$question_name <- "salary"
raw_data_aggregate_expenditure$question_name <- "expenditure"
colnames(raw_data_aggregate_salary)[2] <- "values"
colnames(raw_data_aggregate_expenditure)[2] <- "values"
colnames(raw_data_aggregate_salary)[3] <- "count"
colnames(raw_data_aggregate_expenditure)[3] <- "count"
# Bind result together into one df.
raw_data_aggregate <- rbind(
raw_data_aggregate_salary, raw_data_aggregate_expenditure)
# Only select entries where we actually have a count.
raw_data_aggregate <- subset(
raw_data_aggregate,
raw_data_aggregate$count > 0)
# Order to fit with desired output
raw_data_aggregate <- raw_data_aggregate[ order(raw_data_aggregate$id), ]
In a data.table this would look like
raw_data[, flag0 := (salary > 15000) + (expenditure > 10000)]
raw_data[, flag := sum(flag0), by = "id"]
Here flag0 is the flag by row (which can later be deleted if you like) and flag would be the final result.
Edit: Seeing your reply to #Matt, you seem to want the total amount by salary and expenditure seperately. You can do something like
raw_data[, flag_salary := as.integer(salary > 15000)]
raw_data[, flag_expenditure := as.integer(expenditure > 10000)]
raw_data[, flag_salary := sum(flag_salary), by = "id"]
raw_data[, flag_expenditure := sum(flag_expenditure), by = "id"]

Creating a new dataframe based on statistics calculated from a previous data frame

I have the following data frame
dat <- data.frame(ID = c(1, 1, 1, 2, 2, 2),
A = c(50, 150, 200, 250, 100, NA),
B = c(10, NA, 30, NA, NA, 10))
I want to create a new data frame that tells me the percent of points that are not NA such that
dat2 <- data.frame(ID = c(1, 2),
A =c( 100, 66.6),
B = c(66.6, 33.3))
I can figure how to calculate the values in R, but I can't figure out how to place the calculations into a new data frame.
We can do a group by 'ID' with mean of the non-NA logical vector
library(dplyr)
dat %>%
group_by(ID) %>%
summarise_all(~ 100 *mean(!is.na(.)))
# A tibble: 2 x 3
# ID A B
# <dbl> <dbl> <dbl>
#1 1 100 66.7
#2 2 66.7 33.3
Or using aggregate from base R
aggregate(. ~ ID, dat, FUN = function(x) 100 *mean(!is.na(x)), na.action = NULL)
With data.table, you can do that very efficiently :
library(data.table)
setDT(data)
dat[,.(lapply(.SD, function(x) 100*mean(!is.na(x))), by = "ID"]
.SD means Subset of Data you apply the mean + is.na combination over all your columns. You can also apply that to a subset of columns using .SDcols (e.g. A column) :
dat[,.(lapply(.SD, function(x) 100*mean(!is.na(x))),
by = "ID", .SDcols = c("A")]

Resources