one hot encoding only factor variables in R recipes - r

I have a dataframe df like so
height age dept
69 18 A
44 8 B
72 19 B
58 34 C
I want to one-hot encode only the factor variables (only dept is a factor). How can i do this?
Currently right now I'm selecting everything..
and getting this warning:
Warning message:
The following variables are not factor vectors and will be ignored: height, age
ohe <- df %>%
recipes::recipe(~ .) %>%
recipes::step_dummy(tidyselect::everything()) %>%
recipes::prep() %>%
recipes::bake(df)

Use the where with is.factor instead of everything
library(dplyr)
df %>%
recipes::recipe(~ .) %>%
recipes::step_dummy(tidyselect:::where(is.factor)) %>%
recipes::prep() %>%
recipes::bake(df)
-output
# A tibble: 4 × 4
height age dept_B dept_C
<int> <int> <dbl> <dbl>
1 69 18 0 0
2 44 8 1 0
3 72 19 1 0
4 58 34 0 1
data
df <- structure(list(height = c(69L, 44L, 72L, 58L), age = c(18L, 8L,
19L, 34L), dept = structure(c(1L, 2L, 2L, 3L), .Label = c("A",
"B", "C"), class = "factor")), row.names = c(NA, -4L), class = "data.frame")

Related

Transform columns and rows of a dataframe [duplicate]

This question already has answers here:
How to reshape data from long to wide format
(14 answers)
Closed 2 years ago.
I have a dataframe:
ID Value Name Score Card_type Card_number
1 NA John 242 X 23
1 124 John NA X 23
1 124 John 242 Y 25
1 124 NA 242 Y NA
2 55 Mike NA X 11
2 55 NA 431 X 11
2 55 Mike 431 Y 14
2 NA Mike 431 Y 14
As you see, there are IDs and each of them has two groups (Card_type) for column Card_number. Also as you see, some rows with same ID and Card_type have same missing values in some columns. What I want to get is, to make each ID be one row with filled columns. And column Card_number must be split into two columns Card_number_type_X and Card_number_type_X and column Card_type must be removed.
So the desired result must look like this:
ID Value Name Score Card_number_type_X Card_number_type_Y
1 124 John 242 23 25
2 55 Mike 431 11 14
How could I do that?
One way would be to fill the missing values in each ID and then get data in wide format keeping only the unique values.
library(dplyr)
library(tidyr)
df %>%
group_by(ID) %>%
fill(everything(), .direction = 'updown') %>%
pivot_wider(names_from = Card_type, values_from = Card_number,
values_fn = unique, names_prefix = 'Card_number_type_')
# ID Value Name Score Card_number_type_X Card_number_type_Y
# <int> <int> <chr> <int> <int> <int>
#1 1 124 John 242 23 25
#2 2 55 Mike 431 11 14
It seems original data is not the same as shared data in which case we can try :
df %>%
group_by(ID) %>%
fill(everything(), .direction = 'updown') %>%
distinct() %>%
group_by(ID, Value, Name, Score) %>%
mutate(row = row_number()) %>%
pivot_wider(names_from = Card_type, values_from = Card_number,
names_prefix = 'Card_number_type_')
data
df <- structure(list(ID = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), Value = c(NA,
124L, 124L, 124L, 55L, 55L, 55L, NA), Name = c("John", "John",
"John", NA, "Mike", NA, "Mike", "Mike"), Score = c(242L, NA,
242L, 242L, NA, 431L, 431L, 431L), Card_type = c("X", "X", "Y",
"Y", "X", "X", "Y", "Y"), Card_number = c(23L, 23L, 25L, NA,
11L, 11L, 14L, 14L)), class = "data.frame", row.names = c(NA,
-8L))

Aggregating columns based on columns name in R

I have this dataframe in R
Party Pro2005 Anti2005 Pro2006 Anti2006 Pro2007 Anti2007
R 1 18 0 7 2 13
R 1 19 0 7 1 14
D 13 7 3 4 10 5
D 12 8 3 4 9 6
I want to aggregate it to where it will combined all the pros and anti based on party
for example
Party ProSum AntiSum
R. 234. 245
D. 234. 245
How would I do that in R?
You can use:
library(tidyverse)
df %>%
pivot_longer(-Party,
names_to = c(".value", NA),
names_pattern = "([a-zA-Z]*)([0-9]*)") %>%
group_by(Party) %>%
summarise(across(where(is.numeric), sum, na.rm = T))
# A tibble: 2 x 3
Party Pro Anti
<chr> <int> <int>
1 D 50 34
2 R 5 78
I would suggest a tidyverse approach reshaping the data and the computing the sum of values:
library(tidyverse)
#Data
df <- structure(list(Party = c("R", "R", "D", "D"), Pro2005 = c(1L,
1L, 13L, 12L), Anti2005 = c(18L, 19L, 7L, 8L), Pro2006 = c(0L,
0L, 3L, 3L), Anti2006 = c(7L, 7L, 4L, 4L), Pro2007 = c(2L, 1L,
10L, 9L), Anti2007 = c(13L, 14L, 5L, 6L)), class = "data.frame", row.names = c(NA,
-4L))
The code:
df %>% pivot_longer(cols = -1) %>%
#Format strings
mutate(name=gsub('\\d+','',name)) %>%
#Aggregate
group_by(Party,name) %>% summarise(value=sum(value,na.rm=T)) %>%
pivot_wider(names_from = name,values_from=value)
The output:
# A tibble: 2 x 3
# Groups: Party [2]
Party Anti Pro
<chr> <int> <int>
1 D 34 50
2 R 78 5
Splitting by parties and loop sum over the pro/anti using sapply, finally rbind.
res <- data.frame(Party=sort(unique(d$Party)), do.call(rbind, by(d, d$Party, function(x)
sapply(c("Pro", "Anti"), function(y) sum(x[grep(y, names(x))])))))
res
# Party Pro Anti
# D D 50 34
# R R 5 78
An outer solution is also suitable.
t(outer(c("Pro", "Anti"), c("R", "D"),
Vectorize(function(x, y) sum(d[d$Party %in% y, grep(x, names(d))]))))
# [,1] [,2]
# [1,] 5 78
# [2,] 50 34
Data:
d <- read.table(header=T, text="Party Pro2005 Anti2005 Pro2006 Anti2006 Pro2007 Anti2007
R 1 18 0 7 2 13
R 1 19 0 7 1 14
D 13 7 3 4 10 5
D 12 8 3 4 9 6 ")

How to group contiguous variable into a range r

I have an example dataset:
Road Start End Cat
1 0 50 a
1 50 60 b
1 60 90 b
1 70 75 a
2 0 20 a
2 20 25 a
2 25 40 b
Trying to output following:
Road Start End Cat
1 0 50 a
1 50 90 b
1 70 75 a
2 0 25 a
2 25 40 b
My code doesn't work:
df %>% group_by(Road, cat)
%>% summarise(
min(Start),
max(End)
)
How can I achieve the results I wanted?
We can use rleid from data.table to get the run-length-id-encoding for grouping and then do the summarise
library(dplyr)
library(data.table)
df %>%
group_by(Road, grp = rleid(Cat)) %>%
summarise(Cat = first(Cat), Start = min(Start), End = max(End)) %>%
select(-grp)
# A tibble: 5 x 4
# Groups: Road [2]
# Road Cat Start End
# <int> <chr> <int> <int>
#1 1 a 0 50
#2 1 b 50 90
#3 1 a 70 75
#4 2 a 0 25
#5 2 b 25 40
Or using data.table methods
library(data.table)
setDT(df)[, .(Start = min(Start), End = max(End)), .(Road, Cat, grp = rleid(Cat))]
data
df <- structure(list(Road = c(1L, 1L, 1L, 1L, 2L, 2L, 2L), Start = c(0L,
50L, 60L, 70L, 0L, 20L, 25L), End = c(50L, 60L, 90L, 75L, 20L,
25L, 40L), Cat = c("a", "b", "b", "a", "a", "a", "b")),
class = "data.frame", row.names = c(NA,
-7L))

How would I add a Total Row for each value in a specific column, that does calculations based upon other columns,

Assume I have this data frame
What I want is this
What I want to do is create rows which groups upon the month variable, which then obtains the sum of the total variable, and the unique value of the days_month variable for all of the values in person for that month.
I am just wondering if there is an easy way to do this that does not involve multiple spreads and gathers with adorn totals that I have to change the days in month back to original value after the totals were summed, etc. Is there a quick and easy way to do this?
One option would be to group by 'month', 'days_in_month' and apply adorn_total by group_mapping
library(dplyr)
library(janitor)
df1 %>%
group_by(month, days_in_month) %>%
group_map(~ .x %>%
adorn_totals("row")) %>%
select(names(df1))
# A tibble: 10 x 4
# Groups: month, days_in_month [2]
# month person total days_in_month
# <int> <chr> <int> <int>
# 1 1 John 7 31
# 2 1 Jane 18 31
# 3 1 Tim 20 31
# 4 1 Cindy 11 31
# 5 1 Total 56 31
# 6 2 John 18 28
# 7 2 Jane 13 28
# 8 2 Tim 15 28
# 9 2 Cindy 9 28
#10 2 Total 55 28
If we need other statistics, we can have it in group_map
library(tibble)
df1 %>%
group_by(month, days_in_month) %>%
group_map(~ bind_rows(.x, tibble(person = "Mean", total = mean(.x$total))))
data
df1 <- structure(list(month = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), person = c("John",
"Jane", "Tim", "Cindy", "John", "Jane", "Tim", "Cindy"), total = c(7L,
18L, 20L, 11L, 18L, 13L, 15L, 9L), days_in_month = c(31L, 31L,
31L, 31L, 28L, 28L, 28L, 28L)), class = "data.frame", row.names = c(NA,
-8L))

Calculate rowMeans on a range of column (Variable number)

I want to calculate rowMeans of a range of column but I cannot give the hard-coded value for colnames (e.g c(C1,C3)) or range (e.g. C1:C3) as both names and range are variable. My df looks like:
> df
chr name age MGW.1 MGW.2 MGW.3 HEL.1 HEL.2 HEL.3
1 123 abc 12 10.00 19 18.00 12 13.00 -14
2 234 bvf 24 -13.29 13 -3.02 12 -0.12 24
3 376 bxc 17 -6.95 10 -18.00 15 4.00 -4
This is just a sample, in reality I have columns ranging in MGW.1 ... MGW.196 and so. Here Instead of giving the exact colnames or an exact range I want to pass initial of colnames and want to get average of all columns having that initials. Something like: MGW=rowMeans(df[,MGW.*]), HEL=rowMeans(df[,HEL.*])
So my final output should look like:
> df
chr name age MGW Hel
1 123 abc 12 10.00 19
2 234 bvf 24 13.29 13
3 376 bxc 17 -6.95 10
I know these values are not correct but it is just to give you and idea. Secondly I want to remove all those rows from data frame which contains NA in the entire row except the first 3 values.
Here is the dput for sample example:
> dput(df)
structure(list(chr = c(123L, 234L, 376L), name = structure(1:3, .Label = c("abc",
"bvf", "bxc"), class = "factor"), age = c(12L, 24L, 17L), MGW.1 = c(10,
-13.29, -6.95), MGW.2 = c(19L, 13L, 10L), MGW.3 = c(18, -3.02,
-18), HEL.1 = c(12L, 12L, 15L), HEL.2 = c(13, -0.12, 4), HEL.3 = c(-14L,
24L, -4L)), .Names = c("chr", "name", "age", "MGW.1", "MGW.2",
"MGW.3", "HEL.1", "HEL.2", "HEL.3"), class = "data.frame", row.names = c(NA,
-3L))
Firstly
I think you are looking for this to get mean of rows:
df$mean.Hel <- rowMeans(df[, grep("^HEL.", names(df))])
And to delete the columns afterwards:
df[, grep("^HEL.", names(df))] <- NULL
Secondly
To delete rows which have only NA after the first three elements.
rows.delete <- which(rowSums(!is.na(df)[,4:ncol(df)]) == 0)
df <- df[!(1:nrow(df) %in% rows.delete),]
Here's an idea achieving your desired output without hardcoding variable names:
library(dplyr)
library(tidyr)
df %>%
# remove rows where all values are NA except the first 3 columns
filter(rowSums(is.na(.[4:length(.)])) != length(.) - 3) %>%
# gather the data in a tidy format
gather(key, value, -(chr:age)) %>%
# separate the key column into label and num allowing
# to regroup by variables without hardcoding them
separate(key, into = c("label", "num")) %>%
group_by(chr, name, age, label) %>%
# calculate the mean
summarise(mean = mean(value, na.rm = TRUE)) %>%
spread(label, mean)
I took the liberty to modify your initial data to show how the logic would fit special cases. For example, here we have a row (#4) where all values but the first 3 columns are NAs (according to your requirements, this row should be removed) and one where there is a mix of NAs and values (#5). In this case, I assumed we would like to have a result for MGW since there is a value at MGW.1:
# chr name age MGW.1 MGW.2 MGW.3 HEL.1 HEL.2 HEL.3
#1 123 abc 12 10.00 19 18.00 12 13.00 -14
#2 234 bvf 24 -13.29 13 -3.02 12 -0.12 24
#3 376 bxc 17 -6.95 10 -18.00 15 4.00 -4
#4 999 zzz 21 NA NA NA NA NA NA
#5 888 aaa 12 10.00 NA NA NA NA NA
Which gives:
#Source: local data frame [4 x 5]
#Groups: chr, name, age [4]
#
# chr name age HEL MGW
#* <int> <fctr> <int> <dbl> <dbl>
#1 123 abc 12 3.666667 15.666667
#2 234 bvf 24 11.960000 -1.103333
#3 376 bxc 17 5.000000 -4.983333
#4 888 aaa 12 NaN 10.000000
Data
df <- structure(list(chr = c(123L, 234L, 376L, 999L, 888L), name = structure(c(2L,
3L, 4L, 5L, 1L), .Label = c("aaa", "abc", "bvf", "bxc", "zzz"
), class = "factor"), age = c(12L, 24L, 17L, 21L, 12L), MGW.1 = c(10,
-13.29, -6.95, NA, 10), MGW.2 = c(19L, 13L, 10L, NA, NA), MGW.3 = c(18,
-3.02, -18, NA, NA), HEL.1 = c(12L, 12L, 15L, NA, NA), HEL.2 = c(13,
-0.12, 4, NA, NA), HEL.3 = c(-14L, 24L, -4L, NA, NA)), .Names = c("chr",
"name", "age", "MGW.1", "MGW.2", "MGW.3", "HEL.1", "HEL.2", "HEL.3"
), class = "data.frame", row.names = c("1", "2", "3", "4", "5"))

Resources