I am working with the R programming language.
I have a dataset that looks something like this:
# Toy data. The first element of each vector is a header label, so every
# vector is coerced to character when it is built with c().
x <- c("GROUP", "A", "B", "C")
date_1 <- c("CLASS 1", 20, 60, 82)
date_1_1 <- c("CLASS 2", 37, 22, 8)
date_2 <- c("CLASS 1", 15, 100, 76)
date_2_1 <- c("CLASS 2", 84, 18, 88)

# One column per vector; column names are taken from the variable names.
my_data <- data.frame(x, date_1, date_1_1, date_2, date_2_1)
x date_1 date_1_1 date_2 date_2_1
1 GROUP CLASS 1 CLASS 2 CLASS 1 CLASS 2
2 A 20 37 15 84
3 B 60 22 100 18
4 C 82 8 76 88
I am trying to restructure the data so it looks like this:
Note: in the real Excel data, date_1 is the same date as date_1_1, and date_2 is the same date as date_2_1. R will not keep duplicate column names by default, so I gave them different names.
Currently, I am doing this manually in Excel using various "transpose" functions, but I am wondering if there is a way to do this in R (possibly using the dplyr package).
I have been trying to read different tutorial websites online (Pivoting), but so far nothing seems to match the problem I am trying to work on.
Can someone please show me how to do this?
Thanks!
I made some assumptions about your data because of the duplicate column names. For example, if the column headers follow the pattern CLASS_<ClassNum>_<Date>, you can do the following:
df<-data.frame(GROUP = c("A", "B", "C"),
CLASS_1_1 = c(20, 60, 82),
CLASS_2_1 = c(37, 22, 8),
CLASS_1_2 = c(15,100,76),
CLASS_2_2 = c(84, 18,88))
library(tidyr)
pivot_longer(df, -GROUP,
names_pattern = "(CLASS_.*)_(.*)",
names_to = c(".value", "Date"))
GROUP Date CLASS_1 CLASS_2
<chr> <chr> <dbl> <dbl>
1 A 1 20 37
2 A 2 15 84
3 B 1 60 22
4 B 2 100 18
5 C 1 82 8
6 C 2 76 88
Edit: Substantially improved pivot_longer by using names_pattern= correctly
There are lots of ways to achieve your desired outcome, but I don't believe there is an 'easy'/'simple' way. Here is one potential solution:
library(tidyverse)
library(vctrs)
x <- c("GROUP", "A", "B", "C")
date_1 <- c("CLASS 1", 20, 60, 82)
date_1_1 <- c("CLASS 2", 37, 22, 8)
date_2 <- c("CLASS 1", 15, 100, 76)
date_2_1 <- c("CLASS 2", 84, 18, 88)
my_data <- data.frame(x, date_1, date_1_1, date_2, date_2_1)

# The first row holds the class labels and the column names hold the dates.
# date_1_1 is the same date as date_1 (the suffix only keeps the names unique),
# so reduce every column name to its first number before pivoting. Without this
# step CLASS 1 and CLASS 2 land on different "dates" and cannot be paired up.
class_label <- unlist(my_data[1, ], use.names = FALSE)
date_id <- str_extract(colnames(my_data), "\\d+")
colnames(my_data) <- c("group", paste(class_label[-1], date_id[-1], sep = "-"))

my_data %>%
  slice(-1) %>%                                   # drop the header row (now in the names)
  pivot_longer(-group,                            # "<class>-<date>" -> one column per class
               names_to = c(".value", "date"),
               names_sep = "-") %>%
  mutate(across(starts_with("CLASS"), as.numeric)) %>%
  rename(count_class_1 = `CLASS 1`,
         count_class_2 = `CLASS 2`) %>%
  relocate(date) %>%
  arrange(date)                                   # arrange by "date" to get the desired output
#> # A tibble: 6 × 4
#>   date  group count_class_1 count_class_2
#>   <chr> <chr>         <dbl>         <dbl>
#> 1 1     A                20            37
#> 2 1     B                60            22
#> 3 1     C                82             8
#> 4 2     A                15            84
#> 5 2     B               100            18
#> 6 2     C                76            88
Created on 2022-12-09 with reprex v2.0.2
Related
I am stuck in performing pivot_longer() over multiple sets of columns. Here is the sample dataset
df <- data.frame(
id = c(1, 2),
uid = c("m1", "m2"),
germ_kg = c(23, 24),
mineral_kg = c(12, 17),
perc_germ = c(45, 34),
perc_mineral = c(78, 10))
I need the output dataframe to look like this
# Desired result: one row per id/crop pair, with the kg and perc measurements
# side by side. Assigned only to `out` so the input `df` is not overwritten.
out <- data.frame(
  id = c(1, 1, 2, 2),
  uid = c("m1", "m1", "m2", "m2"),
  # crop order must line up with kg/perc: germ first, then mineral, for each id
  crop = c("germ", "mineral", "germ", "mineral"),
  kg = c(23, 12, 24, 17),
  perc = c(45, 78, 34, 10)
)
# Put the measure first in every name (germ_kg -> kg_germ) so that all columns
# read <measure>_<crop>; pivot_longer() can then split them on "_".
df %>%
  rename_with(function(nm) str_replace(nm, "(.*)_kg", "kg_\\1")) %>%
  pivot_longer(
    cols = -c(id, uid),
    names_to = c(".value", "crop"),
    names_sep = "_"
  )
# A tibble: 4 x 5
id uid crop kg perc
<dbl> <chr> <chr> <dbl> <dbl>
1 1 m1 germ 23 45
2 1 m1 mineral 12 78
3 2 m2 germ 24 34
4 2 m2 mineral 17 10
If you were to use data.table:
library(data.table)
melt(setDT(df), c('id', 'uid'), patterns(kg = 'kg', perc = 'perc'))
id uid variable kg perc
1: 1 m1 1 23 45
2: 2 m2 1 24 34
3: 1 m1 2 12 78
4: 2 m2 2 17 10
I suspect there might be a simpler way using pivot_longer_spec(), but one tricky thing here is that your column names don't have a consistent ordering of their semantic components. @Onyambu's answer deals with this nicely by fixing it upstream.
library(tidyverse)
df %>%
pivot_longer(-c(id, uid)) %>%
separate(name, c("col1", "col2")) %>% # only needed
mutate(crop = if_else(col2 == "kg", col1, col2), # because name
meas = if_else(col2 == "kg", col2, col1)) %>% # structure
select(id, uid, crop, meas, value) %>% # is
pivot_wider(names_from = meas, values_from = value) # inconsistent
# A tibble: 4 x 5
id uid crop kg perc
<dbl> <chr> <chr> <dbl> <dbl>
1 1 m1 germ 23 45
2 1 m1 mineral 12 78
3 2 m2 germ 24 34
4 2 m2 mineral 17 10
I have a data frame that looks like this:
location td1_2019 td2_2019 td3_2019 td4_2019 td1_2020 td2_2020 td3_2020 td4_2020
1 a 50 55 60 58 63 55 60 58
2 b 45 65 57 50 61 66 62 59
3 c 61 66 62 59 45 65 57 50
here, td1_2019 = temperature day1 in 2019 ... and so on
I want to count the number of days on which the temperature was 60 or above, separately for 2019 and 2020, for each location. I want the table to look like the following:
location 2019 2020
1 a 1 2
2 b 1 3
3 c 3 1
I am using R, so I would prefer a solution in R. Any help would be appreciated! Thank you!
A dplyr solution
library(dplyr)
library(tidyr) # pivot_longer() and pivot_wider() live in tidyr, not dplyr

df1 %>%
  # td<day>_<year> -> one row per location/day/year
  pivot_longer(
    -location,
    names_to = c("day", "year"),
    names_pattern = "td(\\d+)_(\\d{4})", # \\d+ so days >= 10 are parsed too
    values_to = "temperature"
  ) %>%
  group_by(year, location) %>%
  # count the days at or above 60; drop the grouping so the result is a plain tibble
  summarise(n = sum(temperature >= 60), .groups = "drop") %>%
  pivot_wider(names_from = "year", values_from = "n")
A Base R solution
# Count, per row of df1, how many columns of each year hold a value >= 60.
nms <- names(df1)
# Logical matrix: TRUE where a cell of df1 is >= 60 (every column is compared,
# but only the columns whose name contains the year are used below).
cond <- df1 >= 60
# Start from the location column and, for each year, add a column named after
# the year that holds the row-wise count of TRUE cells in that year's columns.
Reduce(
function(out, y) `[[<-`(out, y, value = rowSums(cond[, which(grepl(y, nms))])),
c("2019", "2020"),
init = df1[, "location", drop = FALSE]
)
Output
location `2019` `2020`
<chr> <int> <int>
1 a 1 2
2 b 1 3
3 c 3 1
Assume that df1 looks like this
> df1
# A tibble: 3 x 9
location td1_2019 td2_2019 td3_2019 td4_2019 td1_2020 td2_2020 td3_2020 td4_2020
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 a 50 55 60 58 63 55 60 58
2 b 45 65 57 50 61 66 62 59
3 c 61 66 62 59 45 65 57 50
Does this work: I think you want something more year wise.
> library(dplyr)
> library(tidyr)  # pivot_longer() / pivot_wider() come from tidyr
> temp %>%
+   pivot_longer(-location, names_to = c('td', 'year'), names_pattern = '(.*)_(.*)', values_to = 'temp') %>%
+   filter(temp >= 60) %>%
+   count(location, year, name = 'Count') %>%
+   # name id_cols explicitly: passing it by position is deprecated in recent tidyr;
+   # values_fill restores a 0 for location/year pairs that had no day >= 60
+   pivot_wider(id_cols = location, names_from = year, values_from = Count, values_fill = list(Count = 0))
# A tibble: 3 x 3
location `2019` `2020`
<chr> <int> <int>
1 a 1 2
2 b 1 3
3 c 3 1
>
You can use the following tidy solution. Just as in the other solutions posted (which are very nice), a key move is to get the data in a long format using pivot_longer().
library(dplyr)
library(tidyr)
library(stringr)

data %>%
  pivot_longer(-location) %>%
  # the year is the last four characters of names such as "td1_2019"
  mutate(year = str_sub(name, -4)) %>%
  group_by(location, year) %>%
  # one row per location/year holding the number of days at or above 60
  summarise(above60 = sum(value >= 60), .groups = "drop") %>%
  pivot_wider(names_from = year, values_from = above60)
#   location `2019` `2020`
#   <chr>     <int>  <int>
# 1 a             1      2
# 2 b             1      3
# 3 c             3      1
data
structure(list(location = c("a", "b", "c"), td1_2019 = c(50,
45, 61), td2_2019 = c(55, 65, 66), td3_2019 = c(60, 57, 62),
td4_2019 = c(58, 50, 59), td1_2020 = c(63, 61, 45), td2_2020 = c(55,
66, 65), td3_2020 = c(60, 62, 57), td4_2020 = c(58, 59, 50
)), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"
))
A base R option
# Count, per row, the columns holding a value >= 60, grouped by the run of
# digits at the end of each column name. Needs R >= 4.0 for list2DF().
cbind(
# first column kept as-is (presumably the location column)
df[1],
list2DF(
lapply(
# split the columns of the logical data frame into one group per key
split.default(
as.data.frame(df[-1] >= 60),
# key = trailing digits of the column name
gsub(".*?(\\d+)$", "\\1", names(df)[-1],
perl = TRUE
)
),
# number of TRUE cells per row within each group
rowSums
)
)
)
which gives
location 2019 2020
1 a 1 2
2 b 1 3
3 c 3 1
library(tidyverse)
df <- tibble(Date = c(rep(as.Date("2020-01-01"), 3), NA),
col1 = 1:4,
thisCol = c(NA, 8, NA, 3),
thatCol = 25:28,
col999 = rep(99, 4))
#> # A tibble: 4 x 5
#> Date col1 thisCol thatCol col999
#> <date> <int> <dbl> <int> <dbl>
#> 1 2020-01-01 1 NA 25 99
#> 2 2020-01-01 2 8 26 99
#> 3 2020-01-01 3 NA 27 99
#> 4 NA 4 3 28 99
My actual R data frame has hundreds of columns that aren't neatly named, but can be approximated by the df data frame above.
I want to replace all NA values with 0, except in several columns (in my example I want to leave out the Date column and the thatCol column). I'd like to do it in this sort of fashion:
df %>% replace(is.na(.), 0)
#> Error: Assigned data `values` must be compatible with existing data.
#> i Error occurred for column `Date`.
#> x Can't convert <double> to <date>.
#> Run `rlang::last_error()` to see where the error occurred.
And my unsuccessful ideas for accomplishing the "everything except" replace NA are shown below.
df %>% replace(is.na(c(., -c(Date, thatCol)), 0))
df %>% replace_na(list([, c(2:3, 5)] = 0))
df %>% replace_na(list(everything(-c(Date, thatCol)) = 0))
Is there a way to select "everything but" in the way I need? There are hundreds of columns, named inconsistently, so typing them one by one is not a practical option.
You can use mutate_at :
library(dplyr)
Remove them by Name
df %>% mutate_at(vars(-c(Date, thatCol)), ~replace(., is.na(.), 0))
Remove them by position
df %>% mutate_at(-c(1,4), ~replace(., is.na(.), 0))
Select them by name
df %>% mutate_at(vars(col1, thisCol, col999), ~replace(., is.na(.), 0))
Select them by position
df %>% mutate_at(c(2, 3, 5), ~replace(., is.na(.), 0))
If you want to use replace_na
df %>% mutate_at(vars(-c(Date, thatCol)), tidyr::replace_na, 0)
Note that mutate_at() has been superseded by across() as of dplyr 1.0.0.
You have several options here based on data.table.
One of the coolest options: setnafill (version >= 1.12.4):
library(data.table)
setDT(df)
# Fill NAs with 0 in every column except Date and thatCol. The excluded names
# must be quoted strings; setdiff() builds the list of columns to fill.
setnafill(df, fill = 0, cols = setdiff(colnames(df), c("Date", "thatCol")))
Note that your dataframe is updated by reference.
Another base solution:
# Columns whose name starts with "this" or "col" are the ones to fill.
to_change <- grep("^(this|col)", names(df))
# lapply() keeps one vector per column; sapply() would first collapse them
# into a single matrix and force every column to a common type.
df[to_change] <- lapply(df[to_change], function(x) replace(x, is.na(x), 0))
df
# A tibble: 4 x 5
Date col1 thisCol thatCol col999
<date> <dbl> <dbl> <int> <dbl>
1 2020-01-01 1 0 25 99
2 2020-01-01 2 8 26 99
3 2020-01-01 3 0 27 99
4 NA 0 3 28 99
Data(I changed one value):
df <- structure(list(Date = structure(c(18262, 18262, 18262, NA), class = "Date"),
col1 = c(1L, 2L, 3L, NA), thisCol = c(NA, 8, NA, 3), thatCol = 25:28,
col999 = c(99, 99, 99, 99)), row.names = c(NA, -4L), class = c("tbl_df",
"tbl", "data.frame"))
replace works on a data.frame, so we can just do the replacement by index and update the original dataset
df[-c(1, 4)] <- replace(df[-c(1, 4)], is.na(df[-c(1, 4)]), 0)
Or using replace_na with across (from the new dplyr)
library(dplyr)
library(tidyr)
df %>%
mutate(across(-c(Date, thatCol), ~ replace_na(., 0)))
If you know the ones that you don't want to change, you could do it like this:
df <- tibble(Date = c(rep(as.Date("2020-01-01"), 3), NA),
col1 = 1:4,
thisCol = c(NA, 8, NA, 3),
thatCol = 25:28,
col999 = rep(99, 4))
#dplyr
df_nonreplace <- select(df, c("Date", "thatCol"))
df_replace <- df[ ,!names(df) %in% names(df_nonreplace)]
df_replace[is.na(df_replace)] <- 0
df <- cbind(df_nonreplace, df_replace)
> head(df)
Date thatCol col1 thisCol col999
1 2020-01-01 25 1 0 99
2 2020-01-01 26 2 8 99
3 2020-01-01 27 3 0 99
4 <NA> 28 4 3 99
Here is my toy data.
df <- tibble::tribble(
~date1, ~`A Equity`, ~date2, ~`B Equity`, ~date3, ~`C Equity`,
"1/29/2016", 35, "10/31/2017", 67, NA_character_, NA_real_,
"2/29/2016", 40, "11/30/2017", 31, NA_character_, NA_real_,
NA_character_, NA_real_, "12/29/2017", 56, NA_character_, NA_real_)
The real one has over 1000 columns and many more dates.
I want to reshape the data to long format so that the output has only date, var, and value columns, as shown below:
desired_df <- tibble::tribble(
~date, ~var, ~value,
"1/29/2016", "A", 35,
"2/29/2016", "A", 40,
"10/31/2017", "B", 67,
"11/30/2017", "B", 31,
"12/29/2017", "B", 56)
I tried this, but am not getting the desired result:
df2 <- df %>%
pivot_longer(cols = contains("date"), names_to = "dates", values_to = "date") %>%
pivot_longer (cols = contains("Equity"), names_to = "var", values_to = "value") %>%
select(-dates) %>%
distinct() %>%
filter(!is.na(date))
If names_to is a character vector containing the special element ".value", the values_to value will be ignored, and the name of the value column will be derived from part of the existing column names.
library(tidyverse)
df %>%
rename_with(~ str_c(LETTERS[as.integer(str_extract(.x, "\\d+"))], " date"), starts_with("date")) %>%
pivot_longer(everything(),
names_to = c("var", ".value"),
names_sep = " ",
values_drop_na = TRUE)
# # A tibble: 5 × 3
# var date Equity
# <chr> <chr> <dbl>
# 1 A 1/29/2016 35
# 2 B 10/31/2017 67
# 3 A 2/29/2016 40
# 4 B 11/30/2017 31
# 5 B 12/29/2017 56
Base R solution using reshape (yes, it's still in there).
setNames(
na.omit(
reshape(
as.data.frame(df), direction="long", varying=1:6, sep="")),
c("var","date","value","id"))
var date value id
1.1 1 1/29/2016 35 1
2.1 1 2/29/2016 40 2
1.2 2 10/31/2017 67 1
2.2 2 11/30/2017 31 2
3.2 2 12/29/2017 56 3
And if your "toy" data has more columns, just change the varying to:
# "(var|date)" is an alternation; "[var|date]" would be a character class
# matching any single one of the characters v, a, r, |, d, t, e.
grep("^(var|date)", names(df))
My data frame looks like this:
id A T C G ref var
1 1 10 15 7 0 A C
2 2 11 9 2 3 A G
3 3 2 31 1 12 T C
I'd like to create two new columns: ref_count and var_count which will have following values:
Value from A column and value from C column, since ref is A and var is C
Value from A column and value from G column, since ref is A and var is G
etc.
So I'd like to select a column based on the value in another column for each row.
Thanks!
We can use pivot_longer to reshape into 'long' format, filter the rows and then reshape it to 'wide' format with pivot_wider
library(dplyr)
library(tidyr)

df1 %>%
  pivot_longer(cols = A:G) %>%
  # keep only the rows for the columns named in ref / var
  filter(name == ref | name == var) %>%
  # Label each row by the lookup it matched. Labelling by position would swap
  # the two counts whenever the var column comes before the ref column.
  # Assumes ref and var differ within a row.
  mutate(nm1 = if_else(name == ref, "ref_count", "var_count")) %>%
  select(id, value, nm1) %>%
  pivot_wider(names_from = nm1, values_from = value) %>%
  left_join(df1, ., by = "id")
# A tibble: 3 x 9
# id A T C G ref var ref_count var_count
#* <int> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
#1 1 10 15 7 0 A C 10 7
#2 2 11 9 2 3 A G 11 3
#3 3 2 31 1 12 T C 31 1
Or in base R, we can also make use of the vectorized row/column indexing
# Matrix of the four count columns (A, T, C, G), indexed with a two-column
# (row, column) matrix so each row picks the column named in ref / var.
counts <- as.matrix(df1[2:5])
row_idx <- seq_len(nrow(df1))
df1$ref_count <- counts[cbind(row_idx, match(df1$ref, colnames(counts)))]
df1$var_count <- counts[cbind(row_idx, match(df1$var, colnames(counts)))]
data
df1 <- structure(list(id = 1:3, A = c(10, 11, 2), T = c(15, 9, 31),
C = c(7, 2, 1), G = c(0, 3, 12), ref = c("A", "A", "T"),
var = c("C", "G", "C")), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame"))
The following is a tidyverse alternative without creating a long dataframe that needs filtering. It essentially uses tidyr::nest() to nest the dataframe by rows, after which the correct column can be selected for each row.
df1 %>%
  nest(data = -id) %>%
  mutate(
    data = map(
      data,
      # assumes id is unique per row, so `ref` and `var` each hold a single
      # column name inside every nested tibble
      ~ mutate(.x, ref_count = .x[[ref]], var_count = .x[[var]])
    )
  ) %>%
  unnest(data)
#> # A tibble: 3 × 9
#>      id     A     T     C     G ref   var   ref_count var_count
#>   <int> <dbl> <dbl> <dbl> <dbl> <chr> <chr>     <dbl>     <dbl>
#> 1     1    10    15     7     0 A     C            10         7
#> 2     2    11     9     2     3 A     G            11         3
#> 3     3     2    31     1    12 T     C            31         1
A variant of this does not need the (assumed row-specific) id column but defines the nested groups from the unique values of ref and var directly:
df1 %>%
  # one nested tibble per distinct (ref, var) pair
  nest(data = -c(ref, var)) %>%
  mutate(
    data = pmap(
      list(data, ref, var),
      # ref_col / var_col are the single column names shared by the whole group
      function(rows, ref_col, var_col) {
        mutate(rows, ref_count = rows[[ref_col]], var_count = rows[[var_col]])
      }
    )
  ) %>%
  unnest(data)
The data were specified by akrun:
df1 <- structure(list(id = 1:3, A = c(10, 11, 2), T = c(15, 9, 31),
C = c(7, 2, 1), G = c(0, 3, 12), ref = c("A", "A", "T"),
var = c("C", "G", "C")), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame"))