Aggregate all columns in R

This is what I do to total up their grades (the real data has more columns than shown):
test<-with(data,table(Student,Subject))
test <- cbind(test)
test <- as.data.frame(test)
  Student Maths Science English Geography History Art ...
1 George     64      70      40        50      60  70
2 Anna       40      20      65        54      30  50
3 Scott      30      64      30        40      50  20
...
Summarize <- data.frame(
  aggregate(. ~ Maths, data = test, min),
  aggregate(. ~ English, data = test, max),
  aggregate(. ~ Science, data = test, mean))
Is there a way to select all the columns at once and aggregate each of them (min, mean, and max) into a new data frame?
          Min Mean Max
Maths      30   60  90
Science
English
Geography
...
Thanks in advance!

Try:
library(dplyr)
library(tidyr)
df %>%
  summarise_each(funs(min = min(., na.rm = TRUE), max = max(., na.rm = TRUE),
                      mean = mean(., na.rm = TRUE)), -Student) %>%
  gather(Var, Value, Maths_min:Art_mean) %>%
  separate(Var, c("Subject", "Var")) %>%
  spread(Var, Value)
# Subject max mean min
#1 Art 70 46.66667 20
#2 English 65 45.00000 30
#3 Geography 54 48.00000 40
#4 History 60 46.66667 30
#5 Maths 64 44.66667 30
#6 Science 70 51.33333 20
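Note that summarise_each() and funs() have since been deprecated in dplyr. A minimal sketch of the same idea with across() and tidyr's pivot_longer(), assuming the df from the data section below:
# across() builds Maths_min, Maths_mean, ... columns;
# pivot_longer() then splits each name into Subject and the statistic
df %>%
  summarise(across(-Student,
                   list(min = ~ min(.x, na.rm = TRUE),
                        mean = ~ mean(.x, na.rm = TRUE),
                        max = ~ max(.x, na.rm = TRUE)))) %>%
  pivot_longer(everything(),
               names_to = c("Subject", ".value"), names_sep = "_")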
Update
Or you could use aggregate with melt
library(reshape2)
res <- aggregate(value ~ variable, melt(df, id = "Student"),
                 FUN = function(x) c(Min = min(x, na.rm = TRUE),
                                     Mean = mean(x, na.rm = TRUE),
                                     Max = max(x, na.rm = TRUE)))
res1 <- do.call(`data.frame`, res)
colnames(res1) <- gsub(".*\\.", "", colnames(res1))
res1
# variable Min Mean Max
#1 Maths 30 44.66667 64
#2 Science 20 51.33333 70
#3 English 30 45.00000 65
#4 Geography 40 48.00000 54
#5 History 30 46.66667 60
#6 Art 20 46.66667 70
Or using only base R
res2 <- do.call(`data.frame`,
                aggregate(values ~ ind, stack(df, select = -1),
                          FUN = function(x) c(Min = min(x, na.rm = TRUE),
                                              Mean = mean(x, na.rm = TRUE),
                                              Max = max(x, na.rm = TRUE))))
colnames(res2) <- gsub(".*\\.", "", colnames(res2))
res2
# ind Min Mean Max
#1 Art 20 46.66667 70
#2 English 30 45.00000 65
#3 Geography 40 48.00000 54
#4 History 30 46.66667 60
#5 Maths 30 44.66667 64
#6 Science 20 51.33333 70
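For completeness, a compact sapply() sketch (my addition, same df) that returns a matrix with the subjects as row names:
# sapply() gives one column of summaries per subject; t() flips it
# to one row per subject
res3 <- t(sapply(df[-1], function(x)
  c(Min = min(x, na.rm = TRUE), Mean = mean(x, na.rm = TRUE),
    Max = max(x, na.rm = TRUE))))
res3
#           Min     Mean Max
# Maths      30 44.66667  64
# ...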
data
df <- structure(list(Student = c("George", "Anna", "Scott"), Maths = c(64L,
40L, 30L), Science = c(70L, 20L, 64L), English = c(40L, 65L,
30L), Geography = c(50L, 54L, 40L), History = c(60L, 30L, 50L
), Art = c(70L, 50L, 20L)), .Names = c("Student", "Maths", "Science",
"English", "Geography", "History", "Art"), class = "data.frame", row.names = c(NA,
-3L))

Related

How to get column mean grouped by row labels in R dataframe?

I have a dataframe that looks like this
Fruit    2021  2022
Apples     12    29
Bananas    11    31
Apples     44    55
Oranges    30    73
Oranges    19    82
Bananas    24    78
The Fruit names are not ordered, so I can't group them by taking n rows at a time; they're listed randomly. I need to get the mean of fruits sold in 2021 and 2022, as well as the mean sold for apples, oranges, and bananas for each year separately.
My code is (backticks are needed because the names start with digits):
`2021` <- c(mean(df$`2021`), sd(df$`2021`))
`2022` <- c(mean(df$`2022`), sd(df$`2022`))
measure <- c('mean', 'standard deviation')
df1 <- data.table(measure, `2021`, `2022`)
and output looks like this:
Measure             2021  2022
mean                23.3  58
standard deviation  12.4  23.3
But I'm not sure where to start with grouping the rows by name. I need to get something that looks like this
Measure             2021  Apples  Bananas  Oranges  2022  Apples  Bananas  Oranges
mean                23.3                             58
standard deviation  12.4                             23.3
(with the appropriate numbers in the blank spaces)
I suggest this might be better (in the long run) in a long format, which this summarizing can get started on. This computes just the mean; it is not hard to repeat for sd and combine the two (see the sketch after the output below):
fruits <- c(NA, "Apples", "Oranges", "Bananas")
lapply(quux[,-1], function(yr)
  stack(sapply(fruits, function(z)
    mean(yr[is.na(z) | quux$Fruit %in% z])))) |>
  dplyr::bind_rows(.id = "year")
# year values ind
# 1 2021 23.33333 <NA>
# 2 2021 28.00000 Apples
# 3 2021 24.50000 Oranges
# 4 2021 17.50000 Bananas
# 5 2022 58.00000 <NA>
# 6 2022 42.00000 Apples
# 7 2022 77.50000 Oranges
# 8 2022 54.50000 Bananas
where NA in ind indicates all fruits; otherwise the row is labeled with the individual fruit.
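A sketch of the repeat-for-sd step mentioned above, wrapped in a helper (summarise_by_fruit is my name, not part of the original answer; quux and fruits are as defined above):
summarise_by_fruit <- function(f) {
  # apply f to all rows (the NA entry) and to each fruit's rows, per year column
  lapply(quux[,-1], function(yr)
    stack(sapply(fruits, function(z)
      f(yr[is.na(z) | quux$Fruit %in% z])))) |>
    dplyr::bind_rows(.id = "year")
}
merge(summarise_by_fruit(mean), summarise_by_fruit(sd),
      by = c("year", "ind"), suffixes = c("_mean", "_sd"))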
If you put your data in long form, you could use the aggregate function:
a <- aggregate(value ~ year + fruit, data = df, FUN = function(x) c(sd(x), mean(x)))
Here value is a column you would create to hold the numbers that are now under 2021 and 2022, and year is a new column holding 2021 or 2022 accordingly (a sketch of that reshaping follows). Long form is almost always the way to go in R.
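A minimal sketch of that reshaping with base R's reshape(), assuming the df1 object from the data section below (the answer's df, value, and year correspond to df1, value, and year here):
# wide 2021/2022 columns -> long (Fruit, year, value)
long <- reshape(df1, direction = "long",
                varying = c("2021", "2022"), v.names = "value",
                timevar = "year", times = c(2021, 2022))
aggregate(value ~ year + Fruit, data = long,
          FUN = function(x) c(Mean = mean(x), SD = sd(x)))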
We may use
library(dplyr)
library(tidyr)
library(data.table)
library(stringr)
df1 %>%
  pivot_longer(cols = where(is.numeric), names_to = 'year') %>%
  as.data.table %>%
  cube(.(Mean = mean(value), SD = sd(value)),
       by = c("Fruit", "year")) %>%
  filter(!if_all(Fruit:year, is.na)) %>%
  unite(Fruit, Fruit, year, sep = "_", na.rm = TRUE) %>%
  filter(str_detect(Fruit, "_|\\d+")) %>%
  data.table::transpose(make.names = "Fruit", keep.names = "Measure")
-output
Measure Apples_2021 Apples_2022 Bananas_2021 Bananas_2022 Oranges_2021 Oranges_2022 2021 2022
1: Mean 28.00000 42.00000 17.500000 54.50000 24.500000 77.500000 23.33333 58.00000
2: SD 22.62742 18.38478 9.192388 33.23402 7.778175 6.363961 12.42041 23.57965
Or if we want the duplicate column names
df1 %>%
  pivot_longer(cols = where(is.numeric), names_to = 'year') %>%
  as.data.table %>%
  cube(.(Mean = mean(value), SD = sd(value)), by = c("Fruit", "year")) %>%
  mutate(Fruit = coalesce(Fruit, year)) %>%
  drop_na(year) %>%
  arrange(year, str_detect(Fruit, '\\d{4}', negate = TRUE)) %>%
  select(-year) %>%
  data.table::transpose(make.names = "Fruit", keep.names = "Measure")
-output
Measure 2021 Apples Bananas Oranges 2022 Apples Bananas Oranges
1: Mean 23.33333 28.00000 17.500000 24.500000 58.00000 42.00000 54.50000 77.500000
2: SD 12.42041 22.62742 9.192388 7.778175 23.57965 18.38478 33.23402 6.363961
data
df1 <- structure(list(Fruit = c("Apples", "Bananas", "Apples", "Oranges",
"Oranges", "Bananas"), `2021` = c(12L, 11L, 44L, 30L, 19L, 24L
), `2022` = c(29L, 31L, 55L, 73L, 82L, 78L)),
class = "data.frame", row.names = c(NA,
-6L))

Calculating Percent Change in R for Multiple Variables

I'm trying to calculate percent change in R with each of the time points included in the column label (table below). I have dplyr loaded, and my dataset is loaded in R and named data. Below is the code I'm using, but it's not calculating correctly. I want to create a new dataframe called data_per_chg which contains the percent change of each variable from its "v1" value. For instance, for the wbc variable, I would like to calculate the percent change of wbc.v1 from wbc.v1, wbc.v2 from wbc.v1, wbc.v3 from wbc.v1, etc., and do the same for all the remaining variables in my dataset. I'm assuming I can use a loop to do this easily, but I'm pretty new to R, so I'm not quite sure how to proceed. Any guidance will be greatly appreciated.
id  wbc.v1  wbc.v2  wbc.v3  rbc.v1  rbc.v2  rbc.v3  hct.v1  hct.v2  hct.v3
a1      23      63      30      23      56      90      13      89      47
a2      81      45      46     N/A      18      78      14      45      22
a3      NA      27      14      29      67      46      37      34      33
data_per_chg <- data %>%
  group_by(id) %>%
  arrange(id) %>%
  mutate(change = (wbc.v2 - wbc.v1) / wbc.v1)
data_per_chg
The "N/A" strings are first converted to real NA values with na_if(); then each column is measured against its matching v1 column:
library(dplyr)
library(stringr)
data <- data %>%
  na_if("N/A") %>%
  type.convert(as.is = TRUE) %>%
  mutate(across(-c(id, matches("\\.v1$")), ~ {
    v1 <- get(str_replace(cur_column(), "v\\d+$", "v1"))
    (.x - v1) / v1
  }, .names = "{.col}_change"))
-output
data
id wbc.v1 wbc.v2 wbc.v3 rbc.v1 rbc.v2 rbc.v3 hct.v1 hct.v2 hct.v3 wbc.v2_change wbc.v3_change rbc.v2_change rbc.v3_change hct.v2_change hct.v3_change
1 a1 23 63 30 23 56 90 13 89 47 1.7391304 0.3043478 1.434783 2.9130435 5.84615385 2.6153846
2 a2 81 45 46 NA 18 78 14 45 22 -0.4444444 -0.4320988 NA NA 2.21428571 0.5714286
3 a3 NA 27 14 29 67 46 37 34 33 NA NA 1.310345 0.5862069 -0.08108108 -0.1081081
If we want to keep the 'v1' columns as well
data %>%
  na_if("N/A") %>%
  type.convert(as.is = TRUE) %>%
  mutate(across(ends_with('.v1'), ~ .x - .x,
                .names = "{str_replace(.col, 'v1', 'v1change')}")) %>%
  transmute(id, across(ends_with('change')),
            across(-c(id, matches("\\.v1$"), ends_with('change')), ~ {
              v1 <- get(str_replace(cur_column(), "v\\d+$", "v1"))
              (.x - v1) / v1
            }, .names = "{.col}_change")) %>%
  select(id, starts_with('wbc'), starts_with('rbc'), starts_with('hct'))
-output
id wbc.v1change wbc.v2_change wbc.v3_change rbc.v1change rbc.v2_change rbc.v3_change hct.v1change hct.v2_change hct.v3_change
1 a1 0 1.7391304 0.3043478 0 1.434783 2.9130435 0 5.84615385 2.6153846
2 a2 0 -0.4444444 -0.4320988 NA NA NA 0 2.21428571 0.5714286
3 a3 NA NA NA 0 1.310345 0.5862069 0 -0.08108108 -0.1081081
data
data <- structure(list(id = c("a1", "a2", "a3"), wbc.v1 = c(23L, 81L,
NA), wbc.v2 = c(63L, 45L, 27L), wbc.v3 = c(30L, 46L, 14L), rbc.v1 = c("23",
"N/A", "29"), rbc.v2 = c(56L, 18L, 67L), rbc.v3 = c(90L, 78L,
46L), hct.v1 = c(13L, 14L, 37L), hct.v2 = c(89L, 45L, 34L), hct.v3 = c(47L,
22L, 33L)), class = "data.frame", row.names = c(NA, -3L))
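For readers avoiding dplyr, a base R sketch of the same computation (my addition; same data object as above):
# turn "N/A" strings into real NA, then re-type the columns
dat <- type.convert(replace(data, data == "N/A", NA), as.is = TRUE)
# compare each .v2/.v3 column against its matching .v1 column
for (col in grep("\\.v[23]$", names(dat), value = TRUE)) {
  v1 <- dat[[sub("v\\d+$", "v1", col)]]
  dat[[paste0(col, "_change")]] <- (dat[[col]] - v1) / v1
}
dat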

How can I merge different data sets in R knowing that the variable I use for matching the two data sets is not unique?

I have two datasets, and I need to merge them by the ID value. The problems are:
The ID value can be repeated within the same dataset (no other unique value is available).
The two datasets do not have the same number of rows or columns.
Example:
df1
ID  Gender
99  Male
85  Female
 7  Male
df2
ID  Body_Temperature  Body_Temperature_date_time
99  36                1/1/2020 12:00 am
99  38                2/1/2020 10:30 am
99  37                1/1/2020 06:41 am
52  38                1/2/2020 11:00 am
11  39                4/5/2020 09:09 pm
 7  35                9/8/2020 02:30 am
How can I turn these two datasets into one single dataset in a way that allows me to apply some machine learning models on it later on?
Depending on your expected results: if you want to return all rows from each dataframe, you can use a full_join from dplyr:
library(dplyr)
full_join(df2, df1, by = "ID")
Or with base R:
merge(x = df2, y = df1, by = "ID", all = TRUE)
Output
ID Body_Temperature Body_Temperature_date_time Gender
1 99 36 1/1/2020 12:00 am Male
2 99 38 2/1/2020 10:30 am Male
3 99 37 1/1/2020 06:41 am Male
4 52 38 1/2/2020 11:00 am <NA>
5 11 39 4/5/2020 09:09 pm <NA>
6 7 35 9/8/2020 02:30 am Male
7 85 NA <NA> Female
If you have more than 2 dataframes to combine, which only overlap in the ID column, then you can use reduce on a list of dataframes (so put all the dataframes that you want to combine into a list):
library(tidyverse)
df_list <- list(df1, df2)
multi_full <- reduce(df_list, function(x, y, ...)
  full_join(x, y, by = "ID", ...))
Or Reduce with base R:
df_list <- list(df1, df2)
multi_full <- Reduce(function(x, y, ...)
  merge(x, y, by = "ID", all = TRUE, ...), df_list)
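As a side note, repeated IDs silently multiply rows in a join; with dplyr 1.1.0 or later (an assumption about your installed version) you can assert the expected relationship and get an error if it is violated:
# df2 has many rows per ID, df1 at most one
full_join(df2, df1, by = "ID", relationship = "many-to-one")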
Data
df1 <- structure(list(ID = c(99L, 85L, 7L), Gender = c("Male", "Female",
"Male")), class = "data.frame", row.names = c(NA, -3L))
df2 <- structure(list(ID = c(99L, 99L, 99L, 52L, 11L, 7L), Body_Temperature = c(36L,
38L, 37L, 38L, 39L, 35L), Body_Temperature_date_time = c("1/1/2020 12:00 am",
"2/1/2020 10:30 am", "1/1/2020 06:41 am", "1/2/2020 11:00 am",
"4/5/2020 09:09 pm", "9/8/2020 02:30 am")), class = "data.frame", row.names = c(NA,
-6L))

Correlation of similar variables in R

I have slightly edited the data table.
I would like to correlate variables with similar names in my dataset:
A_y B_y C_y A_p B_p C_p
1 15 52 32 30 98 56
2 30 99 60 56 46 25
3 10 25 31 20 22 30
..........
n 55 23 85 12 34 52
I would like to obtain correlation of
A_y-A_p: 0.78
B_y-B_p: 0.88
C_y-C_p: 0.93
How can I do it in R? Is it possible?
This is really dangerous. Behavior of data.frames with invalid column names is undefined by the language definition. Duplicated column names are invalid.
You should restructure your input data. Anyway, here is an approach with your input data.
DF <- read.table(text = " A B C A B C
1 15 52 32 30 98 56
2 30 99 60 56 46 25
3 10 25 31 20 22 30", header = TRUE, check.names = FALSE)
sapply(unique(names(DF)), function(s)
  do.call(cor, unname(DF[, names(DF) == s])))
# A B C
#0.9995544 0.1585501 -0.6004010
#compare:
cor(c(15, 30, 10), c(30, 56, 20))
#[1] 0.9995544
Here is another base R option
within(
  rev(
    stack(
      Map(
        function(x) do.call(cor, unname(x)),
        split.default(df, unique(gsub("_.*", "", names(df))))
      )
    )
  ),
  ind <- sapply(
    ind,
    function(x) {
      paste0(grep(paste0("^", x), names(df), value = TRUE),
             collapse = "-")
    }
  )
)
which gives
ind values
1 A_y-A_p 0.9995544
2 B_y-B_p 0.1585501
3 C_y-C_p -0.6004010
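If the _y/_p suffix convention holds throughout the real data (an assumption), a shorter mapply() sketch over the paired columns:
ys <- grep("_y$", names(df), value = TRUE)
ps <- sub("_y$", "_p", ys)   # matching partner for each _y column
setNames(mapply(function(a, b) cor(df[[a]], df[[b]]), ys, ps),
         paste(ys, ps, sep = "-"))
#  A_y-A_p   B_y-B_p   C_y-C_p
#0.9995544 0.1585501 -0.6004010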
Data
df <- structure(list(A_y = c(15L, 30L, 10L), B_y = c(52L, 99L, 25L),
C_y = c(32L, 60L, 31L), A_p = c(30L, 56L, 20L), B_p = c(98L,
46L, 22L), C_p = c(56L, 25L, 30L)), class = "data.frame", row.names = c("1",
"2", "3"))

How to convert denormalized data to normalized data and vice versa or transpose in R

I have data in the following format:
Japan_n US_n Canada_n Japan_mean US_mean Canada_mean Japan_cv US_cv Canada_cv
76 55 89 145.49 163.78 122.18 23.12 25.47 13.85
I want my output in the following format in R:
        n   mean    cv
Japan   76  145.49  23.12
US      55  163.78  25.47
Canada  89  122.18  13.85
How can I do this?
Try this
library(dplyr)
library(tidyr)
data <- structure(list(Japan_n = 76L,
US_n = 55L,
Canada_n = 89L,
Japan_mean = 145.49,
US_mean = 163.78,
Canada_mean = 122.18,
Japan_cv = 23.12,
US_cv = 25.47,
Canada_cv = 13.85),
.Names = c("Japan_n", "US_n",
"Canada_n", "Japan_mean",
"US_mean", "Canada_mean",
"Japan_cv", "US_cv", "Canada_cv"),
class = "data.frame",
row.names = c(NA, -1L))
my_solution <-
  gather(data, col_01, col_02) %>%
  separate(., col_01, into = c("geo", "stat")) %>%
  spread(., stat, col_02)
my_solution
geo cv mean n
1 Canada 13.85 122.18 89
2 Japan 23.12 145.49 76
3 US 25.47 163.78 55
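gather() and spread() are superseded in current tidyr; a sketch of the same reshape with pivot_longer() and pivot_wider() (same data object):
# split Japan_n into geo = Japan, stat = n, then spread stat back out
pivot_longer(data, everything(),
             names_to = c("geo", "stat"), names_sep = "_") %>%
  pivot_wider(names_from = stat, values_from = value)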
