I have the following table:
A B C D E
1 NA we are here
1 hi we NA here
1 NA NA are there
2 u NA are where
I want my output table to be:
A B C D E
1 hi we are here
2 u NA are where
I have tried the following :
my_fun <- function(x) x[!is.na(x)]
buildingCopy %>%
group_by(A) %>%
summarise_all(funs(my_fun))
it gives error:
Error in summarise_impl(.data, dots) : Column E must be length 1
(a summary value), not 3
Can anyone help me achieve the required data frame?
You can modify your function in the following way:
# Return the first non-missing element of x, or NA when every element is
# missing (or x is empty).
# Indexing an empty vector with [1L] yields an NA of the vector's own type,
# so this works for character, numeric, and logical columns alike and needs
# no explicit, type-specific NA fallback.
my_fun <- function(x) {
  x[!is.na(x)][1L]
}
First it checks whether there are any non-missing values and returns the first non-missing value and NA otherwise.
If you only use the function once, you could also do:
buildingCopy %>%
group_by(A) %>%
summarise_all(funs(if_else(any(!is.na(.)), na.exclude(.)[1], NA_character_)))
# A tibble: 2 x 5
# A B C D E
# <dbl> <chr> <chr> <chr> <chr>
# 1 1 hi we are here
# 2 2 u NA are where
Or you can use the condition in a summarise_if-statement:
buildingCopy %>%
add_row(A = 2, B = "u", C = NA_character_, D = "are", E = "where") %>%
group_by(A) %>%
summarise_if(funs(any(!is.na(.))), funs(na.exclude(.)[1]))
# A tibble: 2 x 5
# A B C D E
# <dbl> <chr> <chr> <chr> <chr>
# 1 1 hi we are here
# 2 2 u NA are where
Data
buildingCopy <- structure(list(A = c(1L, 1L, 1L, 2L),
B = c(NA, "hi", NA, "u"),
C = c("we", "we", NA, NA),
D = c("are", NA, "are", "are"),
E = c("here", "here", "there", "where")),
class = "data.frame", row.names = c(NA, -4L))
The base R function na.omit() can be used here
library(dplyr)
my_fun <- function(x) na.omit(x) %>% first()
buildingCopy %>%
group_by(A) %>%
summarise_all(funs(my_fun))
# A tibble: 2 x 5
A B C D E
<int> <chr> <chr> <chr> <chr>
1 1 hi we are here
2 2 u NA are where
Data
buildingCopy <- readr::read_table(
"A B C D E
1 NA we are here
1 hi we NA here
1 NA NA are there
2 u NA are where")
Related
I have this type of data, where Sequ is a grouping variable:
df <- data.frame(
Sequ = c(1,1,1,
2,2,2,
3,3,
4,4),
Answerer = c("A", NA, NA, "A", NA, NA, "B", NA, "C", NA),
PP_by = c(rep("A",5), rep("B",5)),
pp = c(0.1,0.2,0.3, 1, NA, NA, NA, NA, NA, NA)
)
I need to remove any Sequ where
(i) Answerer == PP_by AND
(ii) there is any NA in pp
I've tried this, but it obviously implements just the second condition (ii):
library(dplyr)
df %>%
group_by(Sequ) %>%
filter(
all(!is.na(pp))
)
The expected result is:
Sequ Answerer PP_by pp
1 1 A A 0.1
2 1 <NA> A 0.2
3 1 <NA> A 0.3
9 4 C B NA
10 4 <NA> B NA
EDIT:
I've come up with this solution:
df %>%
group_by(Sequ) %>%
filter(
first(Answerer) != first(PP_by)
|
all(!is.na(pp))
)
Here's another way:
df %>%
group_by(Sequ) %>%
filter(!(
any(Answerer == PP_by, na.rm = TRUE) &
any(is.na(pp))
))
# # A tibble: 5 × 4
# # Groups: Sequ [2]
# Sequ Answerer PP_by pp
# <dbl> <chr> <chr> <dbl>
# 1 1 A A 0.1
# 2 1 NA A 0.2
# 3 1 NA A 0.3
# 4 4 C B NA
# 5 4 NA B NA
I was having the same problem as How to find last column with value (for each row) in R?, except I have rows with no value (entire row of NA). The sample provided in said post did not have an entire row of NAs.
I was wondering how I should modify the following? I do not want to remove those rows with all NAs because they will be useful in later analysis.
df %>%
rowwise %>%
mutate(m = {tmp <- c_across(starts_with('m'))
tail(na.omit(tmp), 1)}) %>%
ungroup
Thanks a lot in advance!
If all the elements in a row are missing, then a general solution would be to create a condition that returns NA for those rows
library(dplyr)
df %>%
rowwise %>%
mutate(m = {tmp <- c_across(starts_with('m'))
if(all(is.na(tmp))) NA_character_ else
tail(na.omit(tmp), 1)}) %>%
ungroup
-output
# A tibble: 4 × 5
id m_1 m_2 m_3 m
<dbl> <chr> <chr> <chr> <chr>
1 1 a e i i
2 2 b <NA> <NA> b
3 3 <NA> <NA> <NA> <NA>
4 4 d h l l
If the OP wants to return only the last single non-NA element, we may also add an index [1] to extract it, which automatically returns NA when there are no elements
df %>%
rowwise %>%
mutate(m = {tmp <- c_across(starts_with('m'))
tail(na.omit(tmp), 1)[1]}) %>%
ungroup
# A tibble: 4 × 5
id m_1 m_2 m_3 m
<dbl> <chr> <chr> <chr> <chr>
1 1 a e i i
2 2 b <NA> <NA> b
3 3 <NA> <NA> <NA> <NA>
4 4 d h l l
data
df <- structure(list(id = c(1, 2, 3, 4), m_1 = c("a", "b", NA, "d"),
m_2 = c("e", NA, NA, "h"), m_3 = c("i", NA, NA, "l")), row.names = c(NA,
-4L), class = "data.frame")
Using the data from @akrun (many thanks), we could maybe do it this way:
'\\b[^,]+$' is a regular expression:
\\ ... is an escaped backslash: inside an R string literal the backslash itself must be escaped, so R needs \\ where most other languages need only one \
\\b... The metacharacter \b is an anchor, like the ^ and $ signs. It matches at a position that is called a “word boundary”. This match is zero-length.
[^,]+... stands for a negated character class, marked by the ^ caret: one character that is not a comma. The + means one or more such (non-comma) characters.
$ ... means end of string or end of line depending on multiline mode.
library(dplyr)
library(tidyr)
library(stringr)
df %>%
mutate(across(starts_with("m"), ~case_when(!is.na(.) ~ cur_column()), .names = 'new_{col}')) %>%
unite(New_Col, starts_with('new'), na.rm = TRUE, sep = ', ') %>%
mutate(New_Col = str_extract(New_Col, '\\b[^,]+$'))
id m_1 m_2 m_3 New_Col
1 1 a e i m_3
2 2 b <NA> <NA> m_1
3 3 <NA> <NA> <NA> <NA>
4 4 d h l m_3
library(tidyverse)
df <- data.frame(id = c(1, 2, 3, 4), m_1 = c("a", NA, "c", "d"), m_2 = c("e", NA, "g", "h"), m_3 = c("i", NA, NA, "l"))
# For each row, report the NAME of the last non-missing "m_*" column
# (NA when every "m_*" value in the row is missing).
df %>%
  rowwise() %>%
  mutate(
    # Candidate column names, stored as a list-column so each row can see them.
    nms = list(str_subset(names(df), "^m")),
    m = {
      vals <- c_across(starts_with("m"))
      present <- which(!is.na(vals))
      # Look the column up by position, not by value: matching on the value
      # reports an earlier column whenever the last non-missing value also
      # occurs earlier in the same row.
      if (length(present) == 0L) NA_character_ else nms[max(present)]
    }
  ) %>%
  select(-nms)
#> # A tibble: 4 × 5
#> # Rowwise:
#> id m_1 m_2 m_3 m
#> <dbl> <chr> <chr> <chr> <chr>
#> 1 1 a e i m_3
#> 2 2 <NA> <NA> <NA> <NA>
#> 3 3 c g <NA> m_2
#> 4 4 d h l m_3
# only the value, not the column name
df %>%
rowwise() %>%
mutate(
m = c_across(starts_with("m")) %>%
{
ifelse(test = all(is.na(.)),
yes = NA,
no = tail(na.omit(.), 1)
)
}
)
#> # A tibble: 4 × 5
#> # Rowwise:
#> id m_1 m_2 m_3 m
#> <dbl> <chr> <chr> <chr> <chr>
#> 1 1 a e i i
#> 2 2 <NA> <NA> <NA> <NA>
#> 3 3 c g <NA> g
#> 4 4 d h l l
Created on 2022-01-01 by the reprex package (v2.0.1)
I am processing a large dataset adapted to my research. Suppose that I have 4 observations (records) and 5 columns as follows:
x <- data.frame("ID" = c(1, 2, 3, 4),
"group1" = c("A", NA, "B", NA),
"group2" = c("B", "A", NA, "C"),
"hours1" = c(3, NA, 5, NA),
"hours2" = c(1, 2, NA, 5))
> x
ID group1 group2 hours1 hours2
1 A B 3 1
2 <NA> A NA 2
3 B <NA> 5 NA
4 <NA> C NA 5
The "group1" and "group2" are reference columns containing the character values of A, B, and C, and the last two columns, "hours1" and "hours2," are numeric indicating hours obviously.
The column "group1" corresponds to the column "hours1"; likewise, "group2" corresponds to "hours2."
I want to create multiple columns according to the values, A, B, and C, of the reference columns matching to values of "hours1" and "hours2" as follows:
ID group1 group2 hours1 hours2 A B C
1 A B 3 1 3 1 NA
2 <NA> A NA 2 2 NA NA
3 B <NA> 5 NA NA 5 NA
4 <NA> C NA 5 NA NA 5
For example, ID 1 has A in "group1," corresponding to 3 in "hours1," which is found under the column "A." ID 3 has B in "group1," corresponding to 5 in "hours1," which is found under the column "B." In "group2," ID 4 has C, corresponding to 5 in "hours2," which is found under the column "C."
Is there a way to do it using R?
One way would be to combine all the "hours" columns into one column and all the "group" columns into another column. This can be done using pivot_longer. After that we can get the data in wide format and join it with the original data.
library(dplyr)
library(tidyr)
x %>%
pivot_longer(cols = -ID,
names_to = c('.value'),
names_pattern = '(.*?)\\d+',
values_drop_na = TRUE) %>%
pivot_wider(names_from = group, values_from = hours) %>%
left_join(x, by = 'ID') %>%
select(ID, starts_with('group'), starts_with('hour'), everything())
# A tibble: 4 x 8
# ID group1 group2 hours1 hours2 A B C
# <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1 A B 3 1 3 1 NA
#2 2 NA A NA 2 2 NA NA
#3 3 B NA 5 NA NA 5 NA
#4 4 NA C NA 5 NA NA 5
For OP's dataset we can slightly modify the code to achieve the desired result.
zz %>%
pivot_longer(cols = -id,
names_to = c('.value'),
names_pattern = '(.*)_',
values_drop_na = TRUE) %>%
arrange(fu2a) %>%
pivot_wider(names_from = fu2a, values_from = fu2b) %>%
left_join(zz, by = 'id') %>%
select(id, starts_with('fu2a'), starts_with('fu2b'), everything())
Another approach using dplyr could be done separating group and hours variables to compute the desired variables and then merge with the original x:
library(tidyverse)
#Data
# Example data: two group/hours column pairs per ID. Strings are kept as
# character vectors rather than being converted to factors.
x <- data.frame(
  "ID" = c(1, 2, 3, 4),
  "group1" = c("A", NA, "B", NA),
  "group2" = c("B", "A", NA, "C"),
  "hours1" = c(3, NA, 5, NA),
  "hours2" = c(1, 2, NA, 5),
  stringsAsFactors = FALSE
)
#Reshape
x %>%
left_join(x %>% select(1:3) %>%
pivot_longer(cols = -ID) %>%
group_by(ID) %>% mutate(id=1:n()) %>%
left_join(x %>% select(c(1,4:5)) %>%
pivot_longer(cols = -ID) %>%
rename(name2=name,value2=value) %>%
group_by(ID) %>% mutate(id=1:n())) %>%
filter(!is.na(value)) %>% select(ID,value,value2) %>%
pivot_wider(names_from = value,values_from=value2))
Output:
ID group1 group2 hours1 hours2 A B C
1 1 A B 3 1 3 1 NA
2 2 <NA> A NA 2 2 NA NA
3 3 B <NA> 5 NA NA 5 NA
4 4 <NA> C NA 5 NA NA 5
I have a dataframe of the following type
ID case1 case2 case3 case4
1 A B C D
2 B A
3 E F
4 G C A
5 T
I need to change its format, to a long shape, similar as the below:
ID col1 col2
1 A B
1 A C
1 A D
1 B C
1 B D
1 C D
2 B A
3 E F
4 G C
4 G A
4 C A
5 T
As you can see, I need to maintain the ID and ignore empty columns. There are some cases like T that need to remain in the dataset, but without a col2.
I am honestly not sure how to approach this, so that is why there are no examples of what I have tried.
You can get the data in long format and create all combination of values for each ID if the number of rows is greater than 1 in that ID.
library(dplyr)
library(tidyr)
df %>%
pivot_longer(cols = -ID, values_drop_na = TRUE) %>%
group_by(ID) %>%
summarise(value = if(n() > 1) list(setNames(as.data.frame(t(combn(value, 2))),
c('col1', 'col2')))
else list(data.frame(col1 = value[1], col2 = NA_character_))) %>%
unnest(value)
# A tibble: 12 x 3
# ID col1 col2
# <int> <chr> <chr>
# 1 1 A B
# 2 1 A C
# 3 1 A D
# 4 1 B C
# 5 1 B D
# 6 1 C D
# 7 2 B A
# 8 3 E F
# 9 4 G C
#10 4 G A
#11 4 C A
#12 5 T NA
data
df <- structure(list(ID = 1:5, case1 = c("A", "B", "E", "G", "T"),
case2 = c("B", "A", "F", "C", NA), case3 = c("C", NA, NA,
"A", NA), case4 = c("D", NA, NA, NA, NA)),
class = "data.frame", row.names = c(NA, -5L))
I have a data frame like this
# Example data: one id column plus four sparsely filled character columns.
# Strings are kept as character vectors rather than being converted to factors.
df <- data.frame(
  id = 1:4,
  V1 = c("A", NA, "C", NA),
  V2 = c(NA, NA, NA, "E"),
  V3 = c(NA, "B", NA, "F"),
  V4 = c(NA, NA, "D", NA),
  stringsAsFactors = FALSE
)
# id V1 V2 V3 V4
# 1 1 A <NA> <NA> <NA>
# 2 2 <NA> <NA> B <NA>
# 3 3 C <NA> <NA> D
# 4 4 <NA> E F <NA>
How can I extract non-missing elements by rows and stack them into a column? My expected output is:
# id value
# 1 1 A
# 2 2 B
# 3 3 C
# 4 3 D
# 5 4 E
# 6 4 F
Try pivot_longer() or unite() + separate_rows().
library(tidyr)
library(dplyr)
# Method 1: reshape to long format, dropping the NA cells, then discard the
# column that holds the original column names.
df %>%
  pivot_longer(-id, values_drop_na = TRUE) %>%
  select(-name)
# Method 2: paste the non-missing values of each row into one string, then
# split that string back out into one row per value.
df %>%
  unite(value, -id, na.rm = TRUE) %>%
  separate_rows(value)
# # A tibble: 6 x 2
# id value
# <int> <chr>
# 1 1 A
# 2 2 B
# 3 3 C
# 4 3 D
# 5 4 E
# 6 4 F
You can use dplyr and tidyr:
df %>%
tidyr::gather(-id, key = "key", value = "value") %>%
dplyr::filter(!is.na(value))
id key value
1 1 V1 A
2 3 V1 C
3 4 V2 E
4 2 V3 B
5 4 V3 F
6 3 V4 D
One base R solution could be:
na.omit(data.frame(df[1], stack(df[-1])[1]))
id values
1 1 A
3 3 C
8 4 E
10 2 B
12 4 F
15 3 D
How about combining complete.cases with the reshape2 library?
library(reshape2)
df.temp <- melt(df, id.vars = "id")
df.temp[complete.cases(df.temp),-2]
results in
id value
1 1 A
3 3 C
8 4 E
10 2 B
12 4 F
15 3 D
pivot_longer then filter
library(tidyverse)
df <- data.frame(id = 1:4,
V1 = c("A", NA, "C", NA),
V2 = c(NA, NA, NA, "E"),
V3 = c(NA, "B", NA, "F"),
V4 = c(NA, NA, "D", NA), stringsAsFactors = FALSE)
df %>% pivot_longer(-id, names_to = "name", values_to = "value") %>%
filter(!is.na(value)) %>%
select(-name)
#> # A tibble: 6 x 2
#> id value
#> <int> <chr>
#> 1 1 A
#> 2 2 B
#> 3 3 C
#> 4 3 D
#> 5 4 E
#> 6 4 F
Created on 2020-03-02 by the reprex package (v0.3.0)