My data.frame df1 looks like this:
A NA NA B NA NA C NA NA
1 2 3 4 5 6 7 8 9
I want it to look like this:
A 1
A 2
A 3
B 4
B 5
B 6
C 7
C 8
C 9
Any ideas?
Suppose you have a data.frame like this:
df <- data.frame(matrix(c("A", NA, NA, "B", NA, NA, "C", NA, NA, 1:9), byrow = TRUE, nrow = 2))
> df
X1 X2 X3 X4 X5 X6 X7 X8 X9
1 A <NA> <NA> B <NA> <NA> C <NA> <NA>
2 1 2 3 4 5 6 7 8 9
Using tidyr:
# Transpose so every original column becomes a row (V1 = label, V2 = value),
# then carry the last non-NA label downwards.
library(tidyr) # supplies fill() and re-exports %>%
df %>%
  t() %>%
  as.data.frame() %>%
  fill(V1, .direction = "down")
gives you
V1 V2
1 A 1
2 A 2
3 A 3
4 B 4
5 B 5
6 B 6
7 C 7
8 C 8
9 C 9
An option with na.locf from zoo
library(zoo)
na.locf(as.data.frame(t(df)))
# V1 V2
#X1 A 1
#X2 A 2
#X3 A 3
#X4 B 4
#X5 B 5
#X6 B 6
#X7 C 7
#X8 C 8
#X9 C 9
data
df <- data.frame(matrix(c("A", NA, NA, "B", NA, NA, "C", NA, NA, 1:9),
byrow = TRUE, nrow = 2))
Related
I have the following datasets in RStudio:
df =
a b
1 A
1 NA
1 A
1 NA
2 C
2 NA
2 B
3 A
3 NA
3 C
3 D
and fill_with =
a b
1 A
2 B
3 C
How do I fill the NA values in df in the b column according to the a column?
Ex: a=1, b=NA, then I look at the table fill_with at a=1, and I see that I should fill it with b=A.
In the end it should look the following way:
df =
a b
1 A
1 A
1 A
1 A
2 C
2 B
2 B
3 A
3 C
3 C
3 D
We can use ifelse
# Replacement candidate for every row, looked up in fill_with by `a`.
b_from_lookup <- fill_with$b[match(df$a, fill_with$a)]
# Keep existing values; fall back to the lookup only where b is NA.
df$b <- ifelse(is.na(df$b), b_from_lookup, df$b)
Output
a b
1 1 A
2 1 A
3 1 A
4 1 A
5 2 C
6 2 B
7 2 B
8 3 A
9 3 C
10 3 C
11 3 D
library(tidyverse)
df <- read_table("a b
1 A
1 NA
1 A
1 NA
2 C
2 NA
2 B
3 A
3 NA
3 C
3 D")
fill_with <- read_table("a b
1 A
2 B
3 C")
# Look up each row's replacement in fill_with by `a` and use it only where b
# is NA. Filling NAs from neighbouring rows within each `a` group would never
# consult fill_with; it reproduces the expected result only when an adjacent
# value happens to equal the lookup value.
df %>%
  left_join(fill_with, by = "a", suffix = c("", "_fill")) %>%
  mutate(b = coalesce(b, b_fill)) %>%
  select(-b_fill)
# A tibble: 11 x 2
# a b
# <dbl> <chr>
# 1 1 A
# 2 1 A
# 3 1 A
# 4 1 A
# 5 2 C
# 6 2 B
# 7 2 B
# 8 3 A
# 9 3 C
# 10 3 C
# 11 3 D
Base R
# Positions in df$b that still need a value.
missing_idx <- which(is.na(df$b))
# For every row, the position of its `a` key in fill_with; only the rows that
# are missing b are then read from the lookup and written back.
lookup_pos <- match(df$a, fill_with$a)
df$b[missing_idx] <- fill_with$b[lookup_pos[missing_idx]]
a b
1 1 A
2 1 A
3 1 A
4 1 A
5 2 C
6 2 B
7 2 B
8 3 A
9 3 C
10 3 C
11 3 D
library(tidyverse)

df <- data.frame(
  stringsAsFactors = FALSE,
  a = c(1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3),
  b = c("A", NA, "A", NA, "C", NA, "B", "A", NA, "C", "D")
)
fill_with <- data.frame(
  stringsAsFactors = FALSE,
  a = c(1L, 2L, 3L),
  b = c("A", "B", "C")
)

# rows_patch() only overwrites NA cells of x with the matching row of y.
# rows_update() would also replace the existing "C", "A" and "D" values
# (rows 5, 8 and 11), which is not the requested result.
# NOTE(review): df repeats its `a` keys; confirm the installed dplyr accepts
# non-unique keys in x for the rows_*() verbs.
rows_patch(x = df, y = fill_with, by = "a")
#> a b
#> 1 1 A
#> 2 1 A
#> 3 1 A
#> 4 1 A
#> 5 2 C
#> 6 2 B
#> 7 2 B
#> 8 3 A
#> 9 3 C
#> 10 3 C
#> 11 3 D
Created on 2022-08-22 with reprex v2.0.2
I am counting the NA values in each row and then summing those counts by the col1 variable. I want to add a condition to this calculation:
When counting the NAs in a row,
for col2 = a and col2 = b, also look at the col4 column; for col2 = c, do not look at the col4 column.
# creating a dataframe
data_frame <- data.frame(col1 = sample(6:9, 9 , replace = TRUE),
col2 = letters[1:3],
col3 = c(1,NA,NA,1,NA,NA,2,NA,2),
col4 = c(1,4,NA,1,NA,NA,NA,1,2))
data_frame = data_frame %>%
rowwise() %>%
mutate(Count_NA = sum(is.na(cur_data()))) %>%
ungroup
#print (data_frame)
data_frame %>% group_by(col1) %>%
summarize(Sum_Count_NA=sum(Count_NA))
The output I currently get (every NA in the row is counted) is:
col1
col2
col3
col4
Count_NA
8
a
1
1
0
6
b
NA
4
1
8
c
NA
NA
2
7
a
1
1
0
8
b
NA
NA
2
8
c
NA
NA
2
8
a
2
NA
1
8
b
NA
1
1
9
c
2
2
0
After adding the condition, the output I want is;
(NAs in col4 are not counted when col2 = c.)
col1
col2
col3
col4
Count_NA
8
a
1
1
0
6
b
NA
4
1
8
c
NA
NA
1
7
a
1
1
0
8
b
NA
NA
2
8
c
NA
NA
1
8
a
2
NA
1
8
b
NA
1
1
9
c
2
2
0
An option is also to replace the NA elements in the 'col4' with non-NA when 'col2' is 'c' and then do the rowSums on the logical matrix
library(dplyr)
data_frame %>%
mutate(Count_Na = rowSums(is.na(cbind(col3, replace(col4, col2 == 'c', 999)))))
-output
col1 col2 col3 col4 Count_Na
1 7 a 1 1 0
2 9 b NA 4 1
3 9 c NA NA 1
4 7 a 1 1 0
5 7 b NA NA 2
6 7 c NA NA 1
7 7 a 2 NA 1
8 9 b NA 1 1
9 7 c 2 2 0
You can do this:
library(dplyr)
# An NA in col3 always counts; an NA in col4 counts only when col2 is not "c"
# (i.e. for "a" and "b"), as the question asks.
data_frame %>%
  mutate(sum = is.na(col3) + (col2 != "c" & is.na(col4)))
# col1 col2 col3 col4 sum
# 1 8 a 1 1 0
# 2 6 b NA 4 1
# 3 9 c NA NA 1
# 4 8 a 1 1 0
# 5 7 b NA NA 2
# 6 7 c NA NA 1
# 7 7 a 2 NA 1
# 8 9 b NA 1 1
# 9 7 c 2 2 0
data
data_frame <- structure(list(col1 = c(8L, 6L, 9L, 8L, 7L, 7L, 7L, 9L, 7L),
col2 = c("a", "b", "c", "a", "b", "c", "a", "b", "c"), col3 = c(1,
NA, NA, 1, NA, NA, 2, NA, 2), col4 = c(1, 4, NA, 1, NA, NA,
NA, 1, 2)), class = "data.frame", row.names = c(NA, -9L))
This must be easy but my brain is blocked!
I have this dataframe:
col1
<chr>
1 A
2 B
3 NA
4 C
5 D
6 NA
7 NA
8 E
9 NA
10 F
df <- structure(list(col1 = c("A", "B", NA, "C", "D", NA, NA, "E",
NA, "F")), row.names = c(NA, -10L), class = c("tbl_df", "tbl",
"data.frame"))
I want to add a column with uniqueID only for values that are not NA with tidyverse.
Expected output:
col1 uniqueID
<chr> <dbl>
1 A 1
2 B 2
3 NA NA
4 C 3
5 D 4
6 NA NA
7 NA NA
8 E 5
9 NA NA
10 F 6
I have tried: n(), row_number(), cur_group_id ....
We could do this easily in data.table. Specify the condition in i i.e. non-NA elements in 'col1', create the column 'uniqueID' with the sequence of elements by assignment (:=)
library(data.table)
setDT(df)[!is.na(col1), uniqueID := seq_len(.N)]
-output
df
col1 uniqueID
1: A 1
2: B 2
3: <NA> NA
4: C 3
5: D 4
6: <NA> NA
7: <NA> NA
8: E 5
9: <NA> NA
10: F 6
In dplyr, we can use replace
library(dplyr)
# Start from an all-NA integer vector rather than from col1 (a character
# column), so the ids stay numeric instead of being coerced to strings; only
# the non-NA positions receive 1, 2, 3, ...
df %>%
  mutate(uniqueID = replace(rep(NA_integer_, n()), !is.na(col1),
                            seq_len(sum(!is.na(col1)))))
# -output
# A tibble: 10 x 2
# col1 uniqueID
# <chr> <int>
# 1 A 1
# 2 B 2
# 3 <NA> NA
# 4 C 3
# 5 D 4
# 6 <NA> NA
# 7 <NA> NA
# 8 E 5
# 9 <NA> NA
# 10 F 6
Another approach:
library(dplyr)
# The running count of non-NA entries yields consecutive ids; rows where col1
# is NA get NA instead of the running count.
df %>%
  mutate(
    UniqueID = if_else(is.na(col1), NA_integer_, cumsum(!is.na(col1)))
  )
# A tibble: 10 x 2
col1 UniqueID
<chr> <int>
1 A 1
2 B 2
3 NA NA
4 C 3
5 D 4
6 NA NA
7 NA NA
8 E 5
9 NA NA
10 F 6
A base R option using match + na.omit + unique
# Distinct non-NA values in order of first appearance; a value's position in
# this vector is its id, and NA matches nothing, so it stays NA.
known_values <- na.omit(unique(df$col1))
transform(df, uniqueID = match(col1, known_values))
gives
col1 uniqueID
1 A 1
2 B 2
3 <NA> NA
4 C 3
5 D 4
6 <NA> NA
7 <NA> NA
8 E 5
9 <NA> NA
10 F 6
A weird tidyverse solution:
library(dplyr)
df %>%
mutate(id = ifelse(is.na(col1), 0, 1),
id = cumsum(id == 1),
id = ifelse(is.na(col1), NA, id))
# A tibble: 10 x 2
col1 id
<chr> <int>
1 A 1
2 B 2
3 NA NA
4 C 3
5 D 4
6 NA NA
7 NA NA
8 E 5
9 NA NA
10 F 6
I am attempting to merge 2 datasets belonging to a single id with a larger dataset.
However, I am having trouble merging the two single row datasets into a single row within the larger dataset.
Is there a simple way to merge with dplyr and only overwrite values if they are NA's?
My data:
df1 <- data.frame(id=1:5, b=6:10, c=c("a", "b", "c", "d", "e"), d=c(NA, 1,2,3, 4))
df2 <- data.frame(id=6, b=2, c="f", d=NA_real_)
df3 <- data.frame(id=6, b=NA_real_, c=NA_character_, d=5, e="a")
> df1
id b c d
1 1 6 a NA
2 2 7 b 1
3 3 8 c 2
4 4 9 d 3
5 5 10 e 4
> df2
id b c d
1 6 2 f NA
> df3
id b c d e
1 6 NA <NA> 5 a
My attempt:
merge1 <- dplyr::full_join(df1, df2) %>% full_join(df3)
Desired output:
output <- data.frame(id=1:6, b=c(6:10,2), c=c("a", "b", "c", "d", "e", "f"), d=c(NA, 1,2,3, 4, 5), e=c(NA,NA, NA, NA, NA, "a"))
> output
id b c d e
1 1 6 a NA <NA>
2 2 7 b 1 <NA>
3 3 8 c 2 <NA>
4 4 9 d 3 <NA>
5 5 10 e 4 <NA>
6 6 2 f 5 a
As opposed to:
id b c d e
1 1 6 a NA <NA>
2 2 7 b 1 <NA>
3 3 8 c 2 <NA>
4 4 9 d 3 <NA>
5 5 10 e 4 <NA>
6 6 2 f NA <NA>
7 6 NA <NA> 5 a
Thank you
You can try:
# Stack the three data frames, then collapse each id to a single row by
# keeping the first non-NA value of every column (NA if a column has none).
list(df1, df2, df3) %>%
  bind_rows() %>%
  group_by(id) %>%
  summarise(across(everything(), ~ first(na.omit(.x))))
id b c d e
<dbl> <dbl> <chr> <dbl> <fct>
1 1 6 a NA <NA>
2 2 7 b 1 <NA>
3 3 8 c 2 <NA>
4 4 9 d 3 <NA>
5 5 10 e 4 <NA>
6 6 2 f 5 a
you can try
library(tidyverse)
# Convert factor columns to character before stacking df1 and df2, then bring
# in d and e from df3 by id and use df3's d only where the stacked d is NA.
df1 %>%
  mutate(across(where(is.factor), as.character)) %>%
  bind_rows(mutate(df2, across(where(is.factor), as.character))) %>%
  left_join(select(df3, id, d, e), by = "id") %>%
  mutate(d = coalesce(d.x, d.y)) %>%
  select(-d.x, -d.y)
I would like to convert repeating values in a vector into NA's, such that I keep the position of the first occurrence of each new value.
I can find lots of posts on how to solve the removal of duplicate rows, but no posts that solve this issue.
Can you help me convert the column "problem" into the values in the column "desire"?
dplyr solutions are preferred.
library(tidyverse)
df <- tribble(
~frame, ~problem, ~desire,
1, NA, NA,
2, "A", "A",
3, NA, NA,
4, "B", "B",
5, "B", NA,
6, NA, NA,
7, "C", "C",
8, "C", NA,
9, NA, NA,
10, "E", "E")
df
# A tibble: 10 x 3
frame problem desire
<dbl> <chr> <chr>
1 1 NA NA
2 2 A A
3 3 NA NA
4 4 B B
5 5 B NA
6 6 NA NA
7 7 C C
8 8 C NA
9 9 NA NA
10 10 E E
_____EDIT with "Base R"/ "dplyr" solution___
Ronak Shah's solution works. Here it is within a dplyr workflow in case anyone is interested:
df %>%
mutate(
solved = replace(problem, duplicated(problem), NA))
# A tibble: 10 x 4
frame problem desire solved
<dbl> <chr> <chr> <chr>
1 1 NA NA NA
2 2 A A A
3 3 NA NA NA
4 4 B B B
5 5 B NA NA
6 6 NA NA NA
7 7 C C C
8 8 C NA NA
9 9 NA NA NA
10 10 E E E
Using rleid from data.table, we can replace the repeated values within each run with NA.
library(data.table)
df$answer <- replace(df$problem, duplicated(rleid(df$problem)), NA)
# frame problem desire answer
# <dbl> <chr> <chr> <chr>
# 1 1 NA NA NA
# 2 2 A A A
# 3 3 NA NA NA
# 4 4 B B B
# 5 5 B NA NA
# 6 6 NA NA NA
# 7 7 C C C
# 8 8 C NA NA
# 9 9 NA NA NA
#10 10 E E E
For a complete base R option, we can use rle instead of rleid to create the run sequence:
# Label each run of consecutive equal values with its own id, then blank out
# every element that is not the first of its run.
runs <- rle(df$problem)
run_id <- rep(seq_along(runs$values), runs$lengths)
df$answer <- replace(df$problem, duplicated(run_id), NA)
If, as in the example shown, all equal values are always adjacent, duplicated alone is enough:
df$problem <- replace(df$problem, duplicated(df$problem), NA)
We can use data.table
library(data.table)
setDT(df)[duplicated(rleid(problem)), problem := NA][]