Wide Format Summary in tidyverse - r

Hi, I have a dataframe in wide format that is grouped by Site. Each column represents the abundance of a different species (85 total). I am trying to summarize the dataframe to calculate the total number of individuals per Site, regardless of species.
# NOTE: sum(6:91) adds up the integer sequence 6, 7, ..., 91 itself; it does not
# read the values stored in columns 6 to 91, so every Site gets the same total.
df.totals<- df %>% group_by(Site) %>% summarize (total = sum(6:91))

We can gather to 'long' format and then do the sum
library(tidyverse)
# Keep Site plus the columns at positions 6 to 91, tag every row with its row
# name, stack the selected columns into key/val pairs, and add the values up
# within each Site / row combination.
df %>%
  select(Site, 6:91) %>%
  rownames_to_column("rn") %>%
  gather(key, val, -Site, -rn) %>%
  group_by(Site, rn) %>%
  summarise(total = sum(val))
or another option without gathering would be
# Row-wise sum without reshaping: add the selected columns together with
# reduce(), then total those row sums within each Site.
# The grouping column kept by transmute() must be Site (the column selected
# above and grouped on below), not mtcars' `vs`.
df %>%
  select(Site, 6:91) %>%
  # Column 1 of the selection is Site, so sum from column 2 onwards.
  transmute(Site, Sum = reduce(.[2:ncol(.)], `+`)) %>%
  group_by(Site) %>%
  summarise(Sum = sum(Sum))
Using a reproducible example with mtcars
# Reproducible version of the same idea: `vs` is the grouping column and the
# columns at positions 4 to 6 of mtcars are the ones being added together.
mtcars %>%
  select(vs, 4:6) %>%
  # Drop the first selected column (vs) and add the remaining ones row by row.
  transmute(vs, Sum = reduce(.[-1], `+`)) %>%
  group_by(vs) %>%
  summarise(Sum = sum(Sum))

Related

comparing the variables and their values between two data frames

I have two data frames holding the same kind of data, and I want to check that every column contains exactly the same text values in both data frames.
For example, the column "sales executives" should contain the exact name "Micheal klay" in both data frames; if there is any spelling difference or an extra space, I want it to be reported as not matching.
I have tried the approach below. It works for a small data set, but my real data is very large (approximately 10 - 40 million records), so it fails with the error shown below.
Is there a solution, or any other approach, that works for data of this size?
cannot allocate vector of size 3.2GB
library(tidyverse)
# Example data for the comparison. df1 has 13 rows and df2 has 15 rows, with
# the same twelve columns. df2 deliberately differs from df1 in places, e.g.
# a lower-case "ny" in MANi, "ABC" instead of "abc" in emial, and
# "male"/"female" instead of "m"/"f" in mkl.
df1 <- data.frame(MAN=c(6,6,4,6,8,6,8,4,4,6,6,8,8),MANi=c("OD","NY","CA","CA","OD","CA","OD","NY","OL","NY","OD","CA","OD"),
nune=c("akas","mani","juna","mau","nuh","kil","kman","nuha","huna","kman","nuha","huna","mani"),
klay=c(1,2,2,1,1,2,1,2,1,2,1,1,2),emial=c("dd","xyz","abc","dd","xyz","abc","dd","xyz","abc","dd","xyz","abc","dd"),Pass=c("Low","High","Low","Low","High","Low","High","High","Low","High","High","High","Low"),fri=c("KKK","USA","IND","SRI","PAK","CHI","JYP","TGA","KKK","USA","IND","SRI","PAK"),
mkl=c("m","f","m","m","f","m","m","f","m","m","f","m","m"),kin=c("Sent","Rec","Sent","Rec","Sent","Rec","Sent","Rec","Sent","Rec","Rec","Sent","Rec"),munc=c("Car","Bus","Truk","Cyl","Bus","Car","Bus","Bus","Bus","Car","Car","Cyl","Car"),
lone=c("Sr","jun","sr","jun","man","man","jr","Sr","jun","sr","jun","man","man"),wond=c("tko","kent","bho","kilt","kent","bho","kent","bho","bho","kilt","kent","bho","kilt"))
# Second data frame: same columns as df1, two extra rows, and the altered
# spellings described above.
df2 <- data.frame(MAN=c(6,6,4,6,8,6,8,4,4,6,6,8,8,8,6),MANi=c("OD","NY","CA","CA","OD","CA","OD","NY","OL","ny","OD","CA","OD","NY","OL"),
nune=c("akas","mani","juna","mau","nuh","kil","kman","nuha","huna","kman","nuha","huna","mani","juna","mau"),
klay=c(1,2,2,1,1,2,1,2,1,2,1,1,2,2,1),emial=c("dd","xyz","ABC","dd","xyz","ABC","dd","xyz","ABC","dd","xyz","ABC","dd","xyz","ABC"),Pass=c("Low","High","Low","Low","High","Low","High","High","Low","High","High","High","Low","High","High"),fri=c("KKK","USA","IND","SRI","PAK","CHI","JYP","TGA","KKK","USA","IND","SRI","PAK","CHI","JYP"),
mkl=c("male","female","male","male","female","male","male","female","male","male","female","male","male","female","male"),kin=c("Sent","Rec","Sent","Rec","Sent","Rec","Sent","Rec","Sent","Rec","Rec","Sent","Rec","Sent","Rec"),munc=c("Car","Bus","Truk","Cyl","Bus","Car","Bus","Bus","Bus","Car","Car","Cyl","Car","Bus","Bus"),
lone=c("Sr","jun","sr","jun","man","man","jr","Sr","jun","sr","jun","man","man","jr","man"),wond=c("tko","kent","bho","kilt","kent","bho","kent","bho","bho","kilt","kent","bho","kilt","kent","bho"))
# Reshape each data frame into one row per distinct (column name, value) pair
# so the two sources can be compared value by value.
#
# Double columns are converted to character first so that every column can be
# stacked into the single `options` column.
#
# The second distinct() (after pivot_longer) is the important one for large
# inputs: without it each (Names, options) pair is repeated once per source
# row, and the later join on those two keys becomes many-to-many, multiplying
# the repeats on both sides. De-duplicating here keeps the join one-to-one; any
# downstream step that only looks at distinct pairs gives the same result.
df1_long <- df1 %>%
  as_tibble() %>%
  mutate(across(where(is.double), as.character)) %>%
  distinct() %>%
  pivot_longer(
    everything(),
    names_to = "Names",
    values_to = "options"
  ) %>%
  distinct() %>%
  arrange(Names, options)

# Same reshaping for the second data frame.
df2_long <- df2 %>%
  as_tibble() %>%
  mutate(across(where(is.double), as.character)) %>%
  distinct() %>%
  pivot_longer(
    everything(),
    names_to = "Names",
    values_to = "options"
  ) %>%
  distinct() %>%
  arrange(Names, options)
# Outer-join the two long tables on (Names, options), keeping both sides' key
# columns (the .x / .y suffixes) so that a missing side shows up as NA.
# A pair is flagged as consistent only when it is present on both sides.
T1 <- full_join(
  df1_long,
  df2_long,
  by = c("Names", "options"),
  keep = TRUE
) %>%
  distinct(Names.x, options.x, Names.y, options.y) %>%
  arrange(Names.x, Names.y, options.x, options.y) %>%
  mutate(
    consistant_names = !(is.na(Names.x) | is.na(Names.y)),
    consistant_options = !(is.na(options.x) | is.na(options.y))
  )
The required output should look like the example below.
Below are the inconsistencies between the two data sets.

double type object (characters and numbers) to dataframe

I want to create a dataframe (in order to filter) from a "double" object which contains both character and numbers.
My code:
# NOTE(review): the data frame that feeds this pipeline is not shown in the
# snippet; it is presumably piped into select() as the first step.
# Spread Return into one column per Tick, drop Date, and express the column
# means as percentages. The result is a named numeric vector.
select(Tick, Date, Return) %>%
  pivot_wider(names_from = Tick, values_from = Return) %>%
  select(-Date) %>%
  colMeans() * 100
The output is a named vector. We can stack it to create a data.frame with two columns
library(dplyr)
library(tidyr)
# NOTE(review): the data frame that feeds this pipeline is not shown in the
# snippet; it is presumably piped into select() as the first step.
# The braces make the whole `colMeans(.) * 100` expression a single pipeline
# step, so stack() receives the scaled named vector and turns it into a
# two-column data frame.
select(Tick, Date, Return) %>%
  pivot_wider(names_from = Tick, values_from = Return) %>%
  select(-Date) %>%
  {
    colMeans(.) * 100
  } %>%
  stack()
Or use pivot_longer
# NOTE(review): the data frame that feeds this pipeline is not shown in the
# snippet; it is presumably piped into select() as the first step.
# colMeans() returns a named numeric vector, and pivot_longer() only works on
# data frames, so the vector is first turned into a one-row tibble (one column
# per name) before being reshaped into name/value rows.
select(Tick, Date, Return) %>%
  pivot_wider(names_from = Tick, values_from = Return) %>%
  select(-Date) %>%
  {
    colMeans(.) * 100
  } %>%
  as.list() %>%
  as_tibble() %>%
  pivot_longer(cols = everything())

Automatically align dplyr pipes

Is it possible to automatically align dplyr pipes in RStudio?
For example:
# "Before" example: every pipeline step starts at column 0, with no alignment.
data %>%
mutate(something = first + second) %>%
select(first, something) %>%
group_by(first) %>%
summarize(mean = mean(something))
Into:
# "After" example: the same pipeline with each continuation step indented
# under the first line.
data %>%
  mutate(something = first + second) %>%
  select(first, something) %>%
  group_by(first) %>%
  summarize(mean = mean(something))
Many thanks!

How to mutate values of a tibble in long format

I want to normalize using min/max the values of two indicators. Is it possible to do it keeping the tibble in long format? (Below I use left join to do it in wide format).
library(tidyverse)
# Example data: one row per indicator, one column per year.
df <- tibble(
  ind = c(1, 2),
  `2015` = c(3, 10),
  `2016` = c(7, 18),
  `2017` = c(1, 4)
)

# Long format: stack columns 2 to 4 (the years) into year/value pairs.
df2 <- gather(df, "year", "value", 2:4)

# Minimum and maximum value of each indicator.
df3 <- df2 %>%
  group_by(ind) %>%
  summarise(mn = min(value), mx = max(value))

# Attach each indicator's min/max to its long-format rows, then rescale the
# values with (value - min) / (max - min).
df4 <- left_join(df2, df3, by = "ind")
df5 <- mutate(df4, value2 = (value - mn) / (mx - mn))
Created on 2019-10-07 by the reprex package (v0.3.0)
Instead of doing the left_join, we can create the columns directly with mutate after grouping, which avoids the separate summarise step.
library(dplyr)
# A grouped mutate() computes each indicator's min and max on every row, so no
# separate summary table or join is needed. The grouping is dropped before the
# rescaled column is added.
df2 %>%
  group_by(ind) %>%
  mutate(
    mn = min(value),
    mx = max(value)
  ) %>%
  ungroup() %>%
  mutate(value2 = (value - mn) / (mx - mn))
NOTE: Here, we assumed the OP wanted the columns 'mx' and 'mn' in the final output. If the intention is to get only 'value2', there is no need to create the additional columns, as @Gregor mentioned in the comments:
# Rescale each indicator's values to (value - min) / (max - min) in one step,
# without keeping the intermediate min/max columns.
df2 %>%
  group_by(ind) %>%
  mutate(value2 = (value - min(value)) / (max(value) - min(value))) %>%
  # Drop the grouping so later steps do not silently operate per indicator.
  ungroup()
Also, with tidyr 1.0.0, instead of gather we can use pivot_longer, which is more general because it can reshape multiple sets of columns from 'wide' to 'long'.
library(tidyr)
# Start from the wide table: every column except `ind` is stacked into
# pivot_longer()'s default name/value columns, then each indicator's values
# are rescaled by its own min and max.
df %>%
  pivot_longer(cols = -ind) %>%
  group_by(ind) %>%
  mutate(
    mn = min(value),
    mx = max(value)
  ) %>%
  ungroup() %>%
  mutate(value2 = (value - mn) / (mx - mn))

Mutating values of subset of columns into percentage format

I have generated this summary table based on the df below.
# Reproducible 10-row example: a Year (2012 or 2016), a Group (Treat or
# Control) and three uniform(0, 1) measurement columns.
# The columns are drawn in the order Year, Group, V1, V2, V3, so the random
# values are the same as when the columns were built unnamed and renamed after.
set.seed(1)
df <- data.frame(
  Year = sample(c(2012, 2016), 10, replace = TRUE),
  Group = sample(c("Treat", "Control"), 10, replace = TRUE),
  V1 = runif(10, 0, 1),
  V2 = runif(10, 0, 1),
  V3 = runif(10, 0, 1)
)
# Summary table: standard deviation and median of every measurement column
# for each Year / Group combination, with the group size N carried along as an
# extra grouping column.
# A named list of functions replaces the deprecated funs() and yields the same
# <column>_sd / <column>_median names; `.add` replaces the deprecated `add`.
summary.table <- df %>%
  group_by(Year, Group) %>%
  group_by(N = n(), .add = TRUE) %>%
  summarise_all(list(sd = sd, median = median)) %>%
  ungroup() %>%
  # Blank out repeated Year labels so each year is shown only once.
  mutate(Year = ifelse(duplicated(Year), "", Year))
Is there a way I could display the values related to the median columns as percentages?
I did not know how to use mutate() and scales::percent() for only a subset of columns. (I don't want to do it individually, since there will be more columns in the original dataset, which would make that procedure impractical.)
What should I have done instead if I wanted to mutate according to a subset of rows?
Thank you
EDIT:
And if it was like this?
# Variant layout: one row per Year / variable, with one column per
# statistic-and-group combination (e.g. median_Control, sd_Treat).
# A named list of functions replaces the deprecated funs() and yields the same
# <column>_median / <column>_sd names that separate() splits below.
summary.table <- df %>%
  group_by(Year, Group) %>%
  summarise_all(list(median = median, sd = sd)) %>%
  gather(key, value, -Year, -Group) %>%
  # Split e.g. "V1_median" into var = "V1" and stat = "median".
  separate(key, into = c("var", "stat")) %>%
  unite(stat_Group, stat, Group) %>%
  spread(stat_Group, value) %>%
  ungroup() %>%
  # Blank out repeated Year labels so each year is shown only once.
  mutate(Year = ifelse(duplicated(Year), "", Year))
We need to use the percent wrapped on median
# Same summary table, with the median columns formatted as percentage strings
# by wrapping median() in scales::percent(); the sd columns stay numeric.
# A named list (with a lambda for the formatted median) replaces the deprecated
# funs(); `.add` replaces the deprecated `add`.
summary.table <- df %>%
  group_by(Year, Group) %>%
  group_by(N = n(), .add = TRUE) %>%
  summarise_all(
    list(
      sd = sd,
      median = ~ scales::percent(median(.))
    )
  ) %>%
  ungroup() %>%
  # Blank out repeated Year labels so each year is shown only once.
  mutate(Year = ifelse(duplicated(Year), "", Year))

Resources