How can I split and sort this data set - r

Here is a small sample of my data:
var colour no Mcolour Ncolour
sa1_fr_19 B 10 66 3
sa1_fr_19 W 12 85 6
su3_sa2_18 B 8 70 9
su3_sa2_18 W 6 24 1
I want to get this table:
year var sort nB McolourB NcolourB nW McolourW NcolourW
19 sa1 fr 10 66 3 12 85 6
18 su3 sa2 8 70 9 6 24 1
It would be good if we could do it using base R code.

Split the columns on '_' and use pivot_wider.
library(magrittr)
library(tidyr)
# Break the composite key "var" into its three "_"-delimited parts, then
# spread every measurement into one column per colour (noB, noW, ...).
df %>%
  separate(col = var, into = c("var", "sort", "year"), sep = "_") %>%
  pivot_wider(
    names_from = colour,
    values_from = c(no, Mcolour, Ncolour),
    names_sep = ""
  )
# var sort year noB noW McolourB McolourW NcolourB NcolourW
# <chr> <chr> <chr> <int> <int> <int> <int> <int> <int>
#1 sa1 fr 19 10 12 66 85 3 6
#2 su3 sa2 18 8 6 70 24 9 1
data
df <- structure(list(var = c("sa1_fr_19", "sa1_fr_19", "su3_sa2_18",
"su3_sa2_18"), colour = c("B", "W", "B", "W"), no = c(10L, 12L,
8L, 6L), Mcolour = c(66L, 85L, 70L, 24L), Ncolour = c(3L, 6L,
9L, 1L)), class = "data.frame", row.names = c(NA, -4L))

Using data.table
library(splitstackshape)
library(data.table)
# cSplit() splits "var" on "_" into var_1, var_2 and var_3; dcast() then
# spreads the three measurements into one column per colour.
dcast(
  data = cSplit(indt = df, splitCols = "var", sep = "_"),
  formula = var_1 + var_2 + var_3 ~ colour,
  value.var = c("no", "Mcolour", "Ncolour")
)
var_1 var_2 var_3 no_B no_W Mcolour_B Mcolour_W Ncolour_B Ncolour_W
1: sa1 fr 19 10 12 66 85 3 6
2: su3 sa2 18 8 6 70 24 9 1
data
df <- structure(list(var = c("sa1_fr_19", "sa1_fr_19", "su3_sa2_18",
"su3_sa2_18"), colour = c("B", "W", "B", "W"), no = c(10L, 12L,
8L, 6L), Mcolour = c(66L, 85L, 70L, 24L), Ncolour = c(3L, 6L,
9L, 1L)), class = "data.frame", row.names = c(NA, -4L))

Related

Tidyverse method for combining sets of columns based on a condition in the column names

Imagine I have the following columns (among others) in my dataframe (credit to Allan for creating the sample data):
df <- structure(list(a_years = 5:3, a_months = 6:8, a_days = c(23L,
20L, 15L), b_years = c(4L, 5L, 3L), b_months = 0:2, b_days = c(10L,
8L, 6L), c_years = 8:6, c_months = c(11L, 9L, 8L), c_days = c(26L,
19L, 18L)), class = "data.frame", row.names = c(NA, -3L))
df
#> a_years a_months a_days b_years b_months b_days c_years c_months c_days
#> 1 5 6 23 4 0 10 8 11 26
#> 2 4 7 20 5 1 8 7 9 19
#> 3 3 8 15 3 2 6 6 8 18
And I want to combine columns that start with the same grouping key (in this case the letter at the beginning, but in my data it's a longer expression) such that I get columns a_days, b_days, c_days and so on with values in each column equal to x_years * 365 + x_months * 30 + x_days, for each group (a, b, c, d, e and so on) of columns.
Is there a way to accomplish this all at once? Some combination of map() and mutate() comes to mind, or maybe using case_when(), but I can't quite figure it out. Thanks for any guidance you can offer!
You can do this with across inside transmute:
library(dplyr)
# Each across() call returns a data frame with one column per group; adding
# the three frames element-wise gives days + 30 * months + 365 * years.
# The columns are matched by position, and the sum keeps the names of the
# first operand (the *_days columns).
df %>%
  transmute(
    across(contains("days"), function(x) x) +
      across(contains("months"), function(x) x * 30) +
      across(contains("years"), function(x) x * 365)
  )
#> a_days b_days c_days
#> 1 2028 1470 3276
#> 2 1690 1863 2844
#> 3 1350 1161 2448
Sample data
df <- structure(list(a_years = 5:3, a_months = 6:8, a_days = c(23L,
20L, 15L), b_years = c(4L, 5L, 3L), b_months = 0:2, b_days = c(10L,
8L, 6L), c_years = 8:6, c_months = c(11L, 9L, 8L), c_days = c(26L,
19L, 18L)), class = "data.frame", row.names = c(NA, -3L))
df
#> a_years a_months a_days b_years b_months b_days c_years c_months c_days
#> 1 5 6 23 4 0 10 8 11 26
#> 2 4 7 20 5 1 8 7 9 19
#> 3 3 8 15 3 2 6 6 8 18
Created on 2022-09-29 with reprex v2.0.2

Create a variable that indicates the source of the data with R

I want to create a variable that indicates in which dataframe an observation is located (identified with the variable "code").
I have this database:
id code var1 var2 var9
1 1 a 3 5 4
2 2 b 4 54 5435
3 3 c 44 5 5
4 4 d 5 5 54
5 5 e 6 5 6
6 6 f 6 5 6
And these dataframes:
df1
code var2 var3
1 a 23 4
2 e 45 6
3 k 56 98
df2
code var2 var3
1 b 324 4343
2 z 34 545
3 q 545 6
4 j 77 67
df3
code var2 var3
1 c 1 1
2 l 78 56
df4
code var2 var3
1 d 2 2
2 j 1 1
df5
code var2 var3
1 f 5335 343
My expected result:
id code var1 var2 var9 source
1 a 3 5 4 df1
2 b 4 54 5435 df2
3 c 44 5 5 df3
4 d 5 5 54 df4
5 e 6 5 6 df1
6 f 6 5 6 df5
Data
df <- structure(list(id = 1:6, code = c("a", "b", "c", "d", "e", "f"), var1 = c(3L, 4L, 44L, 5L, 6L, 6L), var2 = c(5L, 54L, 5L, 5L, 5L, 5L), var9 = c(4L, 5435L, 5L, 54L, 6L, 6L)), class = "data.frame", row.names = c(NA, -6L))
df1 <- structure(list(code = c("a", "e", "k"), var2 = c(23L, 45L, 56L), var3 = c(4L, 6L, 98L)), class = "data.frame", row.names = c(NA, -3L))
df2 <- structure(list(code = c("b", "z", "q", "j"), var2 = c(324L, 34L, 545L, 77L), var3 = c(4343L, 545L, 6L, 67L)), class = "data.frame", row.names = c(NA, -4L))
df3 <- structure(list(code = c("c", "l"), var2 = c(1L, 78L), var3 = c(1L, 56L)), class = "data.frame", row.names = c(NA, -2L))
df4 <- structure(list(code = c("d", "j"), var2 = 2:1, var3 = 2:1), class = "data.frame", row.names = c(NA, -2L))
df5 <- structure(list(code = "f", var2 = 5335L, var3 = 343L), class = "data.frame", row.names = c(NA, -1L))
You can use bind_rows from dplyr:
library(dplyr)
# Stack the five lookup tables; `.id` stores the list name of the table each
# row came from in a new "source" column.
bind_rows(
  list(df1 = df1, df2 = df2, df3 = df3, df4 = df4, df5 = df5),
  .id = "source"
)
#> source code var2 var3
#> 1 df1 a 23 4
#> 2 df1 e 45 6
#> 3 df1 k 56 98
#> 4 df2 b 324 4343
#> 5 df2 z 34 545
#> 6 df2 q 545 6
#> 7 df2 j 77 67
#> 8 df3 c 1 1
#> 9 df3 l 78 56
#> 10 df4 d 2 2
#> 11 df4 j 1 1
#> 12 df5 f 5335 343

one hot encoding only factor variables in R recipes

I have a dataframe df like so
height age dept
69 18 A
44 8 B
72 19 B
58 34 C
I want to one-hot encode only the factor variables (only dept is a factor). How can I do this?
Currently I'm selecting everything
and getting this warning:
Warning message:
The following variables are not factor vectors and will be ignored: height, age
ohe <- df %>%
recipes::recipe(~ .) %>%
recipes::step_dummy(tidyselect::everything()) %>%
recipes::prep() %>%
recipes::bake(df)
Use where(is.factor) instead of everything()
library(dplyr)
# Dummy-encode only the factor columns: where(is.factor) restricts
# step_dummy() to factors, so numeric columns pass through unchanged.
# where() is exported by tidyselect (>= 1.2.0), so it is called with `::`
# instead of reaching into the private namespace with `:::`.
# Note: step_dummy() drops the first factor level by default (hence dept_B and
# dept_C in the output below); pass one_hot = TRUE for one column per level.
df %>%
  recipes::recipe(~ .) %>%
  recipes::step_dummy(tidyselect::where(is.factor)) %>%
  recipes::prep() %>%
  recipes::bake(df)
-output
# A tibble: 4 × 4
height age dept_B dept_C
<int> <int> <dbl> <dbl>
1 69 18 0 0
2 44 8 1 0
3 72 19 1 0
4 58 34 0 1
data
df <- structure(list(height = c(69L, 44L, 72L, 58L), age = c(18L, 8L,
19L, 34L), dept = structure(c(1L, 2L, 2L, 3L), .Label = c("A",
"B", "C"), class = "factor")), row.names = c(NA, -4L), class = "data.frame")

Aggregating columns based on columns name in R

I have this dataframe in R
Party Pro2005 Anti2005 Pro2006 Anti2006 Pro2007 Anti2007
R 1 18 0 7 2 13
R 1 19 0 7 1 14
D 13 7 3 4 10 5
D 12 8 3 4 9 6
I want to aggregate it so that it combines all the Pro and Anti columns by party,
for example
Party ProSum AntiSum
R. 234. 245
D. 234. 245
How would I do that in R?
You can use:
library(tidyverse)
# Reshape so that the alphabetic prefix of each column name ("Pro" / "Anti")
# becomes its own column (".value") and the year digits are discarded (NA),
# then total every numeric column per party.
df %>%
  pivot_longer(
    -Party,
    names_to = c(".value", NA),
    names_pattern = "([a-zA-Z]*)([0-9]*)"
  ) %>%
  group_by(Party) %>%
  # Pass na.rm inside a lambda: supplying extra arguments through across(...)
  # is deprecated since dplyr 1.1.0. TRUE is spelled out because T can be
  # reassigned.
  summarise(across(where(is.numeric), ~ sum(.x, na.rm = TRUE)))
# A tibble: 2 x 3
Party Pro Anti
<chr> <int> <int>
1 D 50 34
2 R 5 78
I would suggest a tidyverse approach: reshape the data and then compute the sum of values:
library(tidyverse)
#Data
df <- structure(list(Party = c("R", "R", "D", "D"), Pro2005 = c(1L,
1L, 13L, 12L), Anti2005 = c(18L, 19L, 7L, 8L), Pro2006 = c(0L,
0L, 3L, 3L), Anti2006 = c(7L, 7L, 4L, 4L), Pro2007 = c(2L, 1L,
10L, 9L), Anti2007 = c(13L, 14L, 5L, 6L)), class = "data.frame", row.names = c(NA,
-4L))
The code:
# Go long, strip the year from each column name, total per party and stance,
# then go wide again so "Anti" and "Pro" become columns.
df %>%
  pivot_longer(cols = -1) %>%
  # Format strings: remove the digits so names collapse to "Pro" / "Anti"
  mutate(name = gsub("\\d+", "", name)) %>%
  # Aggregate (TRUE rather than T, which is an ordinary reassignable variable)
  group_by(Party, name) %>%
  summarise(value = sum(value, na.rm = TRUE)) %>%
  pivot_wider(names_from = name, values_from = value)
The output:
# A tibble: 2 x 3
# Groups: Party [2]
Party Anti Pro
<chr> <int> <int>
1 D 34 50
2 R 78 5
Split by party, sum over the Pro/Anti columns using sapply, and finally rbind the results.
# by() hands each party's rows to the inner function, which sums all columns
# whose name matches "Pro" and "Anti" respectively; the per-party vectors are
# then row-bound and labelled with the sorted party names.
res <- data.frame(
  Party = sort(unique(d$Party)),
  do.call(
    rbind,
    by(d, d$Party, function(party_rows) {
      sapply(c("Pro", "Anti"), function(stance) {
        sum(party_rows[grep(stance, names(party_rows))])
      })
    })
  )
)
res
# Party Pro Anti
# D D 50 34
# R R 5 78
An outer solution is also suitable.
# outer() evaluates the sum for every (stance, party) pair. Vectorize() is
# needed because outer() passes whole vectors to its function. t() puts the
# parties in rows and the stances in columns.
t(
  outer(
    c("Pro", "Anti"),
    c("R", "D"),
    Vectorize(function(stance, party) {
      sum(d[d$Party %in% party, grep(stance, names(d))])
    })
  )
)
# [,1] [,2]
# [1,] 5 78
# [2,] 50 34
Data:
d <- read.table(header=T, text="Party Pro2005 Anti2005 Pro2006 Anti2006 Pro2007 Anti2007
R 1 18 0 7 2 13
R 1 19 0 7 1 14
D 13 7 3 4 10 5
D 12 8 3 4 9 6 ")

How to group contiguous variable into a range r

I have an example dataset:
Road Start End Cat
1 0 50 a
1 50 60 b
1 60 90 b
1 70 75 a
2 0 20 a
2 20 25 a
2 25 40 b
Trying to output following:
Road Start End Cat
1 0 50 a
1 50 90 b
1 70 75 a
2 0 25 a
2 25 40 b
My code doesn't work:
df %>% group_by(Road, cat)
%>% summarise(
min(Start),
max(End)
)
How can I achieve the results I wanted?
We can use rleid from data.table to get the run-length-id-encoding for grouping and then do the summarise
library(dplyr)
library(data.table)
# rleid(Cat) gives each run of consecutive identical Cat values its own id;
# grouping by Road and that id lets summarise() collapse every run into one
# row. The helper id is dropped at the end.
df %>%
  group_by(Road, grp = rleid(Cat)) %>%
  summarise(
    Cat = first(Cat),
    Start = min(Start),
    End = max(End)
  ) %>%
  select(-grp)
# A tibble: 5 x 4
# Groups: Road [2]
# Road Cat Start End
# <int> <chr> <int> <int>
#1 1 a 0 50
#2 1 b 50 90
#3 1 a 70 75
#4 2 a 0 25
#5 2 b 25 40
Or using data.table methods
library(data.table)
# setDT() converts df to a data.table by reference. Rows are grouped by Road,
# Cat and the run id of Cat, and each run is collapsed to its min Start and
# max End. The helper `grp` column is removed afterwards so the result holds
# only the requested columns; the trailing [] makes the result print.
setDT(df)[, .(Start = min(Start), End = max(End)),
  by = .(Road, Cat, grp = rleid(Cat))
][, grp := NULL][]
data
df <- structure(list(Road = c(1L, 1L, 1L, 1L, 2L, 2L, 2L), Start = c(0L,
50L, 60L, 70L, 0L, 20L, 25L), End = c(50L, 60L, 90L, 75L, 20L,
25L, 40L), Cat = c("a", "b", "b", "a", "a", "a", "b")),
class = "data.frame", row.names = c(NA,
-7L))

Resources