Melt a dataframe by different column names - r

I have a data frame of the following form:
a <- data.frame(list(X1=c("stn", "s1", "stn", "s2"),
X2=c("var1", "1", "var4", "2"),
X3=c("var2", "2", "var3", "3"),
X4=c("NA", "NA", "var2", "2")))
X1 X2 X3 X4
1 stn var1 var2 NA
2 s1 1 2 NA
3 stn var4 var3 var2
4 s2 2 3 2
How can I get the result:
b <- data.frame(list(stn=c("s1", "s2"),
var1=c(1, NA),
var2=c(2, 2),
var3=c(NA, 3),
var4=c(NA, 2)))
stn var1 var2 var3 var4
1 s1 1 2 NA NA
2 s2 NA 2 3 2

A (mostly) base R solution is to split the data.frame at each header row, call janitor::row_to_names(1) on each piece, recombine the pieces, and remove the unwanted column using subset:
# Split `a` into one chunk per header row (a row whose X1 is "stn"), promote
# each chunk's first row to column names, stack the chunks, and drop the
# placeholder column that is literally named "NA".
# The question's data frame is called `a`; unless the user has defined their
# own `df`, a bare `df` resolves to the stats::df() function and `df$X1` fails.
do.call(
  dplyr::bind_rows,
  a |>
    split(cumsum(a$X1 == "stn")) |>
    lapply(\(chunk) janitor::row_to_names(chunk, 1))
) |>
  subset(select = -`NA`)
stn var1 var2 var4 var3
1 s1 1 2 <NA> <NA>
2 s2 <NA> 2 2 3

A dplyr solution:
library(dplyr)
# Start a new group at every header row (X1 == "stn") instead of assuming that
# every block is exactly two rows long.
a %>%
  group_by(grp = cumsum(X1 == "stn")) %>%
  # Within each group, the first row supplies the column names for the rest.
  group_map(~ setNames(.x[-1, ], unlist(.x[1, ], use.names = FALSE))) %>%
  bind_rows() %>%
  # Drop the placeholder column whose header cell was the string "NA".
  select(-`NA`) %>%
  # Select the value columns by prefix so the result does not depend on the
  # column order produced by bind_rows().
  mutate(across(starts_with("var"), as.numeric))
# # A tibble: 2 × 5
# stn var1 var2 var4 var3
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 s1 1 2 NA NA
# 2 s2 NA 2 2 3

Related

Filter by group and conditions

I have this type of data, where Sequ is a grouping variable:
df <- data.frame(
Sequ = c(1,1,1,
2,2,2,
3,3,
4,4),
Answerer = c("A", NA, NA, "A", NA, NA, "B", NA, "C", NA),
PP_by = c(rep("A",5), rep("B",5)),
pp = c(0.1,0.2,0.3, 1, NA, NA, NA, NA, NA, NA)
)
I need to remove any Sequ where
(i) Answerer == PP_by AND
(ii) there is any NA in pp
I've tried this, but it obviously implements just the second condition (ii):
library(dplyr)
df %>%
group_by(Sequ) %>%
filter(
all(!is.na(pp))
)
The expected result is:
Sequ Answerer PP_by pp
1 1 A A 0.1
2 1 <NA> A 0.2
3 1 <NA> A 0.3
9 4 C B NA
10 4 <NA> B NA
EDIT:
I've come up with this solution:
# Keep a Sequ group when it has no missing pp values, or when its first
# Answerer differs from its first PP_by.
df %>%
  group_by(Sequ) %>%
  filter(!anyNA(pp) | first(Answerer) != first(PP_by))
Here's another way:
# Drop a Sequ group only when both hold: some row has Answerer equal to PP_by,
# and pp contains at least one NA. Every other group is kept.
df %>%
  group_by(Sequ) %>%
  filter(
    !any(Answerer == PP_by, na.rm = TRUE) | !anyNA(pp)
  )
# # A tibble: 5 × 4
# # Groups: Sequ [2]
# Sequ Answerer PP_by pp
# <dbl> <chr> <chr> <dbl>
# 1 1 A A 0.1
# 2 1 NA A 0.2
# 3 1 NA A 0.3
# 4 4 C B NA
# 5 4 NA B NA

Merge data frames and divide rows by group

I would like to divide the values from df1 over the values from df2. In this reproducible example, I am able to sum these values. What about the division? Thanks in advance!
df1 <- data.frame(country = c("a", "b", "c"), year1 = c(1, 2, 3), year2 = c(1, 2, 3))
df2 <- data.frame(country = c("a", "b", "d"), year1 = c(1, 2, NA), year2 = c(1, 2, 3))
df3 <- bind_rows(df1, df2) %>%
mutate_if(is.numeric, tidyr::replace_na, 0) %>%
group_by(country) %>%
summarise_all(., sum, na.rm = TRUE) %>%
na_if(., 0)
Expected result is:
# A tibble: 4 x 3
country year1 year2
<chr> <dbl> <dbl>
1 a 1 1
2 b 1 1
3 c NA NA
4 d NA NA
As there are groups with 2 rows and some with 1, use an if/else condition within summarise/across to divide the first element by the last if there are two elements or else return NA
library(dplyr) # version 1.0.4
library(tidyr)
bind_rows(df1, df2) %>%
  # Wrap replace_na() in a lambda: supplying extra arguments through
  # across(...) is deprecated as of dplyr 1.1.0.
  mutate(across(where(is.numeric), ~ replace_na(.x, 0))) %>%
  group_by(country) %>%
  # A country present in both inputs has two rows (df1's first, df2's last),
  # so divide first by last; with a single row there is nothing to divide by.
  summarise(across(
    everything(),
    ~ if (n() == 2) first(.x) / last(.x) else NA_real_
  ))
-output
# A tibble: 4 x 3
# country year1 year2
#* <chr> <dbl> <dbl>
#1 a 1 1
#2 b 1 1
#3 c NA NA
#4 d NA NA
Here is a base R option using merge + split.default
# Full outer join on country; value columns shared by both inputs receive the
# ".x" (df1) and ".y" (df2) suffixes.
df <- merge(df1, df2, by = "country", all = TRUE)
cbind(
  df[1],
  list2DF(lapply(
    # Group the suffixed columns by base name. The pattern is anchored with `$`
    # so only a trailing ".x"/".y" is stripped, never a ".x" or ".y" that
    # happens to occur inside a column name.
    split.default(df[-1], sub("\\.[xy]$", "", names(df)[-1])),
    # Each piece holds the .x column followed by the .y column: compute x / y.
    function(v) do.call("/", v)
  ))
)
which gives
country year1 year2
1 a 1 1
2 b 1 1
3 c NA NA
4 d NA NA

R Count Values of every Column in Dataframe

I can do it using group by with only one Column but I want the Count for every Column in the Dataframe. See example:
Dataframe with similar values in each Column:
df =
C1 C2 C3
1 Positiv Negativ Neutral
2 NA Neutral Positiv
3 Positiv NA Negativ
4 Negativ Positiv NA
I thought about something like this:
df %>% group_by(names(df)) %>% count()
desired Output: Count Values in each Column
C1 C2 C3
1 Positive 2 1 1
2 Negative 1 1 1
3 Neutral 0 1 1
4 NA 1 1 1
Try this with some tidyverse functions and reshaping your data to long and wide:
library(dplyr)
library(tidyr)
# Stack all columns into name/value pairs, count each pair, then spread the
# counts back out with one column per original column (0 where a pair is
# absent).
new <- df %>%
  pivot_longer(cols = everything()) %>%
  group_by(name, value) %>%
  summarise(N = n()) %>%
  pivot_wider(names_from = name, values_from = N, values_fill = 0)
Output:
# A tibble: 4 x 4
value C1 C2 C3
<chr> <int> <int> <int>
1 Negativ 1 1 1
2 Positiv 2 1 1
3 NA 1 1 1
4 Neutral 0 1 1
Some data used:
#Data
df <- structure(list(C1 = c("Positiv", NA, "Positiv", "Negativ"), C2 = c("Negativ",
"Neutral", NA, "Positiv"), C3 = c("Neutral", "Positiv", "Negativ",
NA)), class = "data.frame", row.names = c("1", "2", "3", "4"))
We can use base R with table
# For each column, tabulate its values against a fixed set of levels so that
# absent levels are counted as 0, and always include an <NA> row.
sapply(
  df,
  \(column) table(
    factor(column, levels = c("Negativ", "Positiv", "Neutral")),
    useNA = "always"
  )
)
# C1 C2 C3
#Negativ 1 1 1
#Positiv 2 1 1
#Neutral 0 1 1
#<NA> 1 1 1
Or using a vectorized approach
# Cross-tabulate every value against the index of the column it came from.
# useNA = 'always' also appends an NA column for the column index; keep only
# the first ncol(df) columns instead of hard-coding the position to drop.
table(unlist(df), c(col(df)), useNA = 'always')[, seq_len(ncol(df))]
# 1 2 3
# Negativ 1 1 1
# Neutral 0 1 1
# Positiv 2 1 1
# <NA> 1 1 1
data
df <- structure(list(C1 = c("Positiv", NA, "Positiv", "Negativ"), C2 = c("Negativ",
"Neutral", NA, "Positiv"), C3 = c("Neutral", "Positiv", "Negativ",
NA)), class = "data.frame", row.names = c("1", "2", "3", "4"))

Merge multiple rows based on single column value

I am having following table:
A B C D E
1 NA we are here
1 hi we NA here
1 NA NA are there
2 u NA are where
I want my output table to be:
A B C D E
1 hi we are here
2 u NA are where
I have tried the following :
my_fun <- function(x) x[!is.na(x)]
buildingCopy %>%
group_by(A) %>%
summarise_all(funs(my_fun))
it gives error:
Error in summarise_impl(.data, dots) : Column E must be length 1
(a summary value), not 3
Can anyone help me to achieve required data frame.
You can modify your function in the following way:
# Return the first non-missing element of `x`, or a single NA of the same type
# as `x` when it has no non-missing values.
#
# The check is a plain scalar if/else (any() yields one value), and the
# fallback NA is taken from `x` itself, so the function works for any atomic
# column type rather than only character columns.
my_fun <- function(x) {
  present <- !is.na(x)
  if (any(present)) x[present][1L] else x[NA_integer_]
}
First it checks whether there are any non-missing values and returns the first non-missing value and NA otherwise.
If you only use the function once you could also do:
buildingCopy %>%
  group_by(A) %>%
  # funs() is deprecated since dplyr 0.8.0; across() with a lambda replaces it.
  # A scalar if/else is enough here because any() yields a single value.
  summarise(across(
    everything(),
    ~ if (any(!is.na(.x))) na.exclude(.x)[1] else NA_character_
  ))
# A tibble: 2 x 5
# A B C D E
# <dbl> <chr> <chr> <chr> <chr>
# 1 1 hi we are here
# 2 2 u NA are where
Or you can use the condition in a summarise_if-statement:
# Variant using summarise_if(): the first funs() is the column predicate, the
# second is the summary applied to the columns that pass it.
# NOTE(review): funs() is deprecated and the *_if() scoped verbs are superseded
# in current dplyr - confirm the installed version still accepts this form.
buildingCopy %>%
# Appends one more row with A = 2 before grouping.
add_row(A = 2, B = "u", C = NA_character_, D = "are", E = "where") %>%
group_by(A) %>%
# na.exclude(.)[1] takes the first non-missing value of each column per group.
summarise_if(funs(any(!is.na(.))), funs(na.exclude(.)[1]))
# A tibble: 2 x 5
# A B C D E
# <dbl> <chr> <chr> <chr> <chr>
# 1 1 hi we are here
# 2 2 u NA are where
Data
buildingCopy <- structure(list(A = c(1L, 1L, 1L, 2L),
B = c(NA, "hi", NA, "u"),
C = c("we", "we", NA, NA),
D = c("are", NA, "are", "are"),
E = c("here", "here", "there", "where")),
class = "data.frame", row.names = c(NA, -4L))
The base R function na.omit() can be used here
library(dplyr)
# Drop the missing values of `x`, then take the first remaining element;
# first() falls back to its default (a missing value) when nothing remains.
my_fun <- function(x) na.omit(x) %>% first()
buildingCopy %>%
  group_by(A) %>%
  # Hand the function straight to across(); funs() is deprecated since
  # dplyr 0.8.0.
  summarise(across(everything(), my_fun))
# A tibble: 2 x 5
A B C D E
<int> <chr> <chr> <chr> <chr>
1 1 hi we are here
2 2 u NA are where
Data
buildingCopy <- readr::read_table(
"A B C D E
1 NA we are here
1 hi we NA here
1 NA NA are there
2 u NA are where")

Fill in missing data that is same across columns

I need to fill in some missing data from a merge that is the same in all columns. After the merge, all the values are NA, but i would like a quick way to fill them in since their values are the same.
Example:
df <- structure(list(date = structure(c(-25932, -25931, -25930, -25929,
-25928), class = "Date"), year = c(1899, 1899, 1899, 1899, 1899
), month = c(1, 1, 1, 1, 1), day = c(1, 2, 3, 4, 5), test1 = c(NA,
NA, "VAR1", NA, NA), test2 = c(NA, NA, "VAR2", NA, NA), test3 = c(NA,
NA, "VAR3", NA, NA)), .Names = c("date", "year", "month", "day",
"test1", "test2", "test3"), row.names = c(NA, 5L), class = "data.frame")
# Tedious way, but works
df$test1 <- "VAR1"
# Desired output
date year month day test1 test2 test3
1 1899-01-01 1899 1 1 VAR1 VAR2 VAR3
2 1899-01-02 1899 1 2 VAR1 VAR2 VAR3
3 1899-01-03 1899 1 3 VAR1 VAR2 VAR3
4 1899-01-04 1899 1 4 VAR1 VAR2 VAR3
5 1899-01-05 1899 1 5 VAR1 VAR2 VAR3
You can try something like the following:
df
# date year month day test1 test2 test3
# 1 1899-01-01 1899 1 1 <NA> <NA> <NA>
# 2 1899-01-02 1899 1 2 <NA> <NA> <NA>
# 3 1899-01-03 1899 1 3 VAR1 VAR2 VAR3
# 4 1899-01-04 1899 1 4 <NA> <NA> <NA>
# 5 1899-01-05 1899 1 5 <NA> <NA> <NA>
# Overwrite every column whose name contains "test" with its first non-NA
# value (NA if the column has none); the single value is recycled down the
# column by the data.frame assignment.
df[grep("test", names(df))] <- lapply(
  df[grep("test", names(df))],
  \(column) column[which(!is.na(column))[1]]
)
df
# date year month day test1 test2 test3
# 1 1899-01-01 1899 1 1 VAR1 VAR2 VAR3
# 2 1899-01-02 1899 1 2 VAR1 VAR2 VAR3
# 3 1899-01-03 1899 1 3 VAR1 VAR2 VAR3
# 4 1899-01-04 1899 1 4 VAR1 VAR2 VAR3
# 5 1899-01-05 1899 1 5 VAR1 VAR2 VAR3
We can also use data.table. Convert the 'data.frame' to 'data.table' (setDT(df)). Based on the index of 'test' columns ('nm1'), we loop with for and set the NA elements by the non-NA elements in each column.
library(data.table)
# Positions of the columns whose names start with "test".
nm1 <- grep('^test', names(df))
# Convert df to a data.table by reference so set() can modify it in place.
setDT(df)
for (j in nm1) {
  # Fill the NA rows of column j with its first non-NA value. Taking `[1L]`
  # keeps the replacement at length 1 even when the column holds several
  # non-NA values, which otherwise need not match the number of NA rows.
  set(df, i = which(is.na(df[[j]])), j = j, value = na.omit(df[[j]])[1L])
}
df
# date year month day test1 test2 test3
#1: 1899-01-01 1899 1 1 VAR1 VAR2 VAR3
#2: 1899-01-02 1899 1 2 VAR1 VAR2 VAR3
#3: 1899-01-03 1899 1 3 VAR1 VAR2 VAR3
#4: 1899-01-04 1899 1 4 VAR1 VAR2 VAR3
#5: 1899-01-05 1899 1 5 VAR1 VAR2 VAR3

Resources