Ok so my dataframe looks like this; let's call it df
KEY A1 A2 A3 A4 B1 B2 B3 B4 C1 C2 C3 C4
1 120 100 NA 110 1 1 NA 1 NA NA NA NA
2 100 NA 115 NA NA NA NA NA Y N Y N
So what I'm trying to do is create a new column X with a value of 1 whenever an A column has a value of 100 and the corresponding B or C column has a value of 1 or "Y" respectively. In row 1 that would be A2 and B2, and in row 2 that would be A1 and C1.
I tried doing three sets of gather and then using the mutate function using case_when. like so
# Asker's first attempt: three successive gather() calls, then a case_when() flag.
# NOTE(review): dx, dxadm and dxpoa are not columns of the sample df shown
# above -- presumably they come from the asker's real data; confirm.
df<- df %>%
gather(key="A",value="code",dx)%>%
gather(key="B",value="number",dxadm)%>%
gather(key="C",value="character",dxpoa) %>%
mutate(X=case_when(
# NOTE(review): no gather() above creates a `present` column (the B values
# were gathered into `number`) -- presumably `number` was intended.
code == 100 & present >0 ~ 1,
code ==100 & character == "Y"~1)
)
Except my spread function of these rows came back with rows all awry and my X out of place.
Alternatively, I considered something like
# Asker's second idea: write one pair of case_when() conditions per column
# index (1, 2, ...). This is an abbreviated sketch -- the surrounding text
# continues it "for all permutations" and the mutate( call is left unclosed.
df <- df %>%
mutate(X=case_when(
A1 == 100 & B1 >0 ~ 1,
A1 ==100 & C1 == "Y"~1,
A2 == 100 & B2 >0 ~ 1,
A2 ==100 & C2 == "Y"~1,)
and so on for all permutations. The two problems with this are that I have a lot of columns and I'd like to do this for multiple different values of A.
Can anyone recommend an alternative or at least a way to make the second solution into something that would only require one annoying long piece of code that I could make into a more generalizable function? Thanks!
A suggestion
# Load packages with library() so that a missing package stops the script
# immediately (require() only warns and returns FALSE).
library(read.so) #awesome package to read from Stackoverflow,
# available on GitHub [https://alistaire47.github.io/read.so/][1]
library(tidyr)
library(reshape2)
library(dplyr)

dat <- read.so()

# Reshape to one row per (KEY, number) with columns A, B and C, then keep the
# rows where A is 100.
dat %>%
  gather(var, value, 2:13) %>%                        # make it long
  mutate(var = gsub("([A-Z])", "\\1_", var)) %>%      # add underscore: "A1" -> "A_1"
  separate(var, c("var", "number")) %>%               # separate your column
  # value.var is named explicitly so dcast() does not have to guess it
  dcast(KEY + number ~ var, value.var = "value") %>%  # dcast is a bit complex but quite powerful
  group_by(KEY) %>%
  filter(A == 100)
# A tibble: 2 x 5
# Groups: KEY [2]
KEY number A B C
<int> <chr> <chr> <chr> <chr>
1 1 2 100 1 <NA>
2 2 1 100 <NA> Y
A solution using dplyr and tidyr. We can gather all the columns except KEY, separate the letters and numbers, and then spread the letter so that we can create the X column without specifying the numbers. Notice that I assume that if the condition is not met, X would be 0. Based on your description, I used any(A %in% 100 & (B %in% 1 | C %in% "Y")) to test the condition: if any of the numbers within a KEY meets the condition, X would be 1.
library(dplyr)
library(tidyr)

# Reshape so that each (KEY, Number) pair is one row with columns A, B and C,
# then flag every KEY in which at least one Number has A == 100 together with
# B == 1 or C == "Y".
df2 <- df %>%
  gather(Column, Value, -KEY) %>%
  # sep = 1 splits after the first character: "A1" -> Letter "A", Number "1"
  separate(Column, into = c("Letter", "Number"), sep = 1) %>%
  spread(Letter, Value, convert = TRUE) %>%
  group_by(KEY) %>%
  # %in% never yields NA, so any() is always TRUE or FALSE; as.integer() maps
  # that to 1L / 0L without calling ifelse() on a scalar.
  mutate(X = as.integer(any(A %in% 100 & (B %in% 1 | C %in% "Y")))) %>%
  # drop the grouping so later steps do not inherit it by accident
  ungroup()

df2 %>% as.data.frame()
# KEY Number A B C X
# 1 1 1 120 1 <NA> 1
# 2 1 2 100 1 <NA> 1
# 3 1 3 NA NA <NA> 1
# 4 1 4 110 1 <NA> 1
# 5 2 1 100 NA Y 1
# 6 2 2 NA NA N 1
# 7 2 3 115 NA Y 1
# 8 2 4 NA NA N 1
I think the structure of df2 is good, but if you really want the original structure, we can do the following.
# Rebuild the original wide layout from df2, keeping the new X column.
df3 <- df2 %>%
  gather(Letter, Value, A:C) %>%               # stack A, B and C into Letter/Value pairs
  unite(Column, Letter, Number, sep = "") %>%  # glue "A" and "1" back into "A1"
  spread(Column, Value) %>%                    # one column per original name
  select(names(df), X)                         # original column order, X last

as.data.frame(df3)
# KEY A1 A2 A3 A4 B1 B2 B3 B4 C1 C2 C3 C4 X
# 1 1 120 100 <NA> 110 1 1 <NA> 1 <NA> <NA> <NA> <NA> 1
# 2 2 100 <NA> 115 <NA> <NA> <NA> <NA> <NA> Y N Y N 1
df3 is the final output.
DATA
# Sample data from the question, parsed from inline text.
df <- read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  text = "KEY A1 A2 A3 A4 B1 B2 B3 B4 C1 C2 C3 C4
1 120 100 NA 110 1 1 NA 1 NA NA NA NA
2 100 NA 115 NA NA NA NA NA Y N Y N"
)
Same idea as Tjebo, but sticking to the tidyverse....
library(tidyverse)

# Sample data typed out column by column.
dat <- data.frame(
  KEY = c(1L, 2L),
  A1 = c(120L, 100L),
  A2 = c(100L, NA),
  A3 = c(NA, 115L),
  A4 = c(110L, NA),
  B1 = c(1L, NA),
  B2 = c(1L, NA),
  B3 = c(NA, NA),
  B4 = c(1L, NA),
  C1 = c(NA, "Y"),
  C2 = c(NA, "N"),
  C3 = c(NA, "Y"),
  C4 = c(NA, "N"),
  stringsAsFactors = FALSE
)

# One row per (KEY, number) with columns A, B and C; keep rows where A is 100.
dat %>%
  gather(var, value, -KEY) %>% # make it long
  # first character becomes the new `var`, second character becomes `number`
  extract(var, regex = "(.)(.)", into = c("var", "number")) %>%
  spread(var, value) %>%
  filter(A %in% 100)
#> KEY number A B C
#> 1 1 2 100 1 <NA>
#> 2 2 1 100 <NA> Y
Created on 2018-02-27 by the reprex package (v0.2.0).
Related
I have the following dataframe in R
library(dplyr)
library(tidyr)
# Example data: two IDs with 0/1 indicator columns A1 and B1.
df <- data.frame(
  ID = c("A", "A", "A", "A", "B", "B", "B"),
  A1 = c(0, 0, 0, 0, 1, 0, 1),
  B1 = c(1, 0, 1, 0, 0, 0, 0)
)
The dataframe appears as follows
ID A1 B1
1 A 0 1
2 A 0 0
3 A 0 1
4 A 0 0
5 B 1 0
6 B 0 0
7 B 1 0
I would like to obtain the following dataframe
ID A1 B1
1 A NA 1
2 A NA 0
3 A NA 1
4 A NA 0
5 B 1 NA
6 B 0 NA
7 B 1 NA
I have tried the following code
# Asker's attempt. Each case_when() has a single condition and no TRUE
# fallback, so groups whose sum is not 0 match nothing and also become NA.
df%>%group_by(ID)%>%
mutate(A1=case_when(sum(A1)==0~NA))%>%
mutate(B1=case_when(sum(B1)==0~NA))
This converts A1 and B1 completely to NA values.
I request some help here.
We can group_by ID and use mutate_all with replace
library(dplyr)

# Within each ID, set a column to NA when all of its values are 0.
df %>%
  group_by(ID) %>%
  mutate_all(function(col) replace(col, all(col == 0), NA))
# ID A1 B1
# <fct> <dbl> <dbl>
#1 A NA 1
#2 A NA 0
#3 A NA 1
#4 A NA 0
#5 B 1 NA
#6 B 0 NA
#7 B 1 NA
If there are other columns and we want to apply this only to specific columns we can use mutate_at
# Same replacement, restricted to the columns A1 and B1.
df %>%
  group_by(ID) %>%
  mutate_at(
    vars(A1, B1),
    function(col) replace(col, all(col == 0), NA)
  )
Using case_when we can do this as
# case_when() version: the TRUE branch keeps the original values when the
# group's column is not all 0.
df %>%
  group_by(ID) %>%
  mutate_all(
    ~ case_when(
      all(. == 0) ~ NA_real_,
      TRUE ~ .
    )
  )
The problem in OP's attempt was there was no TRUE case defined in case_when so when no condition is matched it returns NA by default. From ?case_when
If no cases match, NA is returned.
So if we define the TRUE case it would work as expected. Also we should not check for sum(A1)==0 because if there are negative and positive values in the column (like -2 , +2) they would add up to 0 giving unexpected results.
# Explicit per-column version of the case_when() fix, with a TRUE fallback.
df %>%
  group_by(ID) %>%
  mutate(
    A1 = case_when(
      all(A1 == 0) ~ NA_real_,
      TRUE ~ A1
    ),
    B1 = case_when(
      all(B1 == 0) ~ NA_real_,
      TRUE ~ B1
    )
  )
With tidyverse, we can use if/else
library(tidyverse)

# all(. == 0) is a single TRUE/FALSE per group and column, so a plain
# if / else is enough to pick between NA and the untouched column.
df %>%
  group_by(ID) %>%
  mutate_all(
    list(~ if (all(. == 0)) NA_integer_ else .)
  )
# ID A1 B1
# <fct> <dbl> <dbl>
#1 A NA 1
#2 A NA 0
#3 A NA 1
#4 A NA 0
#5 B 1 NA
#6 B 0 NA
#7 B 1 NA
Or without any if/else
# Arithmetic trick: !. is TRUE where a value is 0, so all(!.) is TRUE when the
# whole group column is 0. NA^TRUE is NA while NA^FALSE (NA^0) is 1, so the
# product is all NA for an all-zero column and unchanged otherwise.
df %>%
group_by(ID) %>%
mutate_all(~ NA^all(!.) * .)
or using data.table
library(data.table)

# setDT() converts df to a data.table in place; within each ID every column in
# .SD is set to NA when all of its values are 0.
setDT(df)[,
  lapply(.SD, function(col) replace(col, all(col == 0), NA)),
  by = ID
]
Or using base R
# For each ID, x holds the columns of df other than the first. colSums(!!x)
# counts the non-zero entries per column, so !colSums(!!x) is TRUE for all-zero
# columns; NA^TRUE is NA and NA^FALSE is 1, and [col(x)] expands that
# per-column factor to the shape of x before multiplying.
by(df[-1], df$ID, FUN = function(x) x * (NA^ !colSums(!!x))[col(x)])
I have the following data frame,
# Asker's data: mixing numbers with the literal string "<NA>" makes both
# vectors character, so they are written as strings here.
c1 <- c("1", "2", "<NA>", "<NA>")
c2 <- c("<NA>", "<NA>", "3", "4")
df <- data.frame(c1 = c1, c2 = c2)
>df
c1 c2
1 1 <NA>
2 2 <NA>
3 <NA> 3
4 <NA> 4
The following is the desired output that I'm trying to obtain after merging columns 1 ,2
>df
c1
1 1
2 2
3 3
4 4
I tried,
# Asker's attempt: paste() joins c1 and c2 with a space, so the literal "<NA>"
# strings end up inside x.
df <- mutate(df, x =paste(c1,c2))
which gives
> df
c1 c2 x
1 1 <NA> 1 <NA>
2 2 <NA> 2 <NA>
3 <NA> 3 <NA> 3
4 <NA> 4 <NA> 4
Could someone give suggestions on how to obtain the desired output?
One way is this:
# Same data with real missing values instead of "<NA>" strings.
c1 <- c(1, 2, NA, NA)
c2 <- c(NA, NA, 3, 4)
df <- data.frame(c1, c2)

# Take c2 where c1 is missing, c1 otherwise.
df2 <- data.frame(c1 = with(df, ifelse(is.na(c1), c2, c1)))
#df2
# c1
#1 1
#2 2
#3 3
#4 4
You are close, but you are pasting together two strings where one uses the string NA in angled brackets to represent nothing, and if you are pasting strings together and want a string to not appear in the pasted string you need to have it as a zero length string. You can do this using the recode command in dplyr.
You can modify your code to be:
library(dplyr)

# Map the literal "<NA>" strings to empty strings, then concatenate.
df <- mutate(
  df,
  x = paste0(
    recode(c1, "<NA>" = ""),
    recode(c2, "<NA>" = "")
  )
)
Another way using dplyr from tidyverse:
# Fill c1 from c2 where c1 is missing, then keep only the merged column.
df2 <- df %>%
  mutate(c3 = if_else(is.na(c1), c2, c1)) %>%
  select(-c1, -c2) %>% # Given you only wanted one column
  rename(c1 = c3)      # Given you wanted the column to be called c1
Output:
c1
1 1
2 2
3 3
4 4
You could use rowSums :
# Row-wise sum ignoring NA; when each row has exactly one non-missing value,
# the sum is that value.
data.frame(c1 = rowSums(df,na.rm = TRUE))
# c1
# 1 1
# 2 2
# 3 3
# 4 4
Since it seems that the dataframe actually contains NA values rather than '<NA>' strings, I would suggest using coalesce:
c1 <- c(1, 2, NA, NA)
c2 <- c(NA, NA, 3, 4)
df <- data.frame(c1, c2)

library(tidyverse)

# coalesce() takes, for each row, the first non-missing value of c1, c2.
mutate(df, c3 = coalesce(c1, c2))
Output:
c1 c2 c3
1 1 NA 1
2 2 NA 2
3 NA 3 3
4 NA 4 4
I would like to reshape my data in a way that distinct values in one column, related to another column, are displayed in newly created columns
df
A B
1 <NA> <NA>
2 a b
3 a d
4 b c
similar to :
> df %>%
+ group_by(A) %>%
+ summarise(n_distinct(B))
# A tibble: 3 x 2
A `n_distinct(B)`
<chr> <int>
1 a 2
2 b 1
3 NA 1
But instead of counting the occurrences, just display the actual values in a new column?
something like the below:
df
A B
1 <NA> <NA>
2 a b **d**
4 b c
I tried spreading, but It is not working, the below error comes up:
Error: Duplicate identifiers for rows
both my columns are factors, but can be reclassified if need be .
Thank you!
library(dplyr)
library(tidyr)

# Collapse the distinct B values of each A into one comma-separated string,
# then split that string into the columns B1 and B2.
# NOTE(review): into = paste0("B", 1:2) assumes at most two distinct values
# per group -- confirm against the real data.
df %>%
  group_by(A) %>%
  summarise(B = paste0(unique(B), collapse = ",")) %>%
  separate(B, into = paste0("B", 1:2))
# A tibble: 3 x 3
A B1 B2
<chr> <chr> <chr>
1 a b d
2 b c NA
3 NA NA NA
Warning message:
Expected 2 pieces. Missing pieces filled with `NA` in 2 rows [2, 3].
Here is an option using spread after creating a sequence column
library(tidyverse)

# Number the rows within each A ("B1", "B2", ...) and use that label as the
# key when spreading B into columns.
df %>%
  group_by(A) %>%
  mutate(n1 = paste0("B", row_number())) %>%
  ungroup() %>%
  spread(n1, B)
# A tibble: 3 x 3
# A B1 B2
# <fct> <fct> <fct>
#1 a b d
#2 b c <NA>
#3 <NA> <NA> <NA>
data
# Example data: the first row is missing in both columns.
df <- data.frame(
  A = c(NA, "a", "a", "b"),
  B = c(NA, "b", "d", "c")
)
In my data frame I'm trying to count certain text '000', 'xxx' and not (000|xxx)
My dataframe is this:
Name per1 per2 per3
a1 000 xxx 230
a1 xxx 000 NA
a2 000 340 xxx
a3 000 xxx NA
Desired result count:
000 xxx Others
a1 2 2 1
a2 1 1 1
a3 1 1 0
Using dplyr: I tried but going wrong, please help on how to attain this
# Asker's attempt (reported as not working).
# NOTE(review): the dplyr verb used elsewhere in these answers is group_by(),
# not groupby(); grepl('000') is given a pattern but no vector to search; and
# 000 is not a syntactic name, so it presumably needs backticks (`000`).
df %>% groupby(Name) %>% filter(grepl('000')) %>% summarize(000 = n())
An option is to convert data to long format and then use reshape2::dcast to get count as:
library(tidyverse)
library(reshape2)

# Long format, label missing values "Others", then count each value per Name.
df %>%
  gather(key, value, -Name) %>%
  mutate(value = ifelse(is.na(value), "Others", value)) %>%
  dcast(Name ~ value, fun.aggregate = length)
# Name 000 230 340 Others xxx
# 1 a1 2 1 0 1 2
# 2 a2 1 0 1 0 1
# 3 a3 1 0 0 1 1
OR: If OP is interested in having count for 000, xxx and Others categories then:
library(tidyverse)
library(reshape2)

# Keep "000" and "xxx" as they are and label everything else -- including NA,
# which %in% treats as not matching -- as "Others", then count per Name.
df %>%
  gather(key, value, -Name) %>%
  mutate(value = ifelse(value %in% c("000", "xxx"), value, "Others")) %>%
  dcast(Name ~ value, fun.aggregate = length)
# Name 000 Others xxx
# 1 a1 2 2 2
# 2 a2 1 1 1
# 3 a3 1 1 1
Data:
# Sample data parsed from inline text. The argument is spelled out in full as
# stringsAsFactors instead of relying on partial argument matching.
df <- read.table(text = "
Name per1 per2 per3
a1 000 xxx 230
a1 xxx 000 NA
a2 000 340 xxx
a3 000 xxx NA",
header = TRUE, stringsAsFactors = FALSE)
Here are a few tidyverse possibilities, all variations on the same idea:
library(tidyverse)

# TRUE is written out below: T is an ordinary variable that can be reassigned.

# Variant 1: nest the per* columns of each Name into a list-column and
# summarise row by row.
df %>%
  nest(-Name) %>%
  rowwise() %>%
  summarize(`000` = sum(data == "000", na.rm = TRUE),
            xxx = sum(data == "xxx", na.rm = TRUE),
            Others = sum(!is.na(data)) - `000` - xxx)

# Variant 2: same nesting, but grouped by Name, so the nested data frame has
# to be pulled out of the list-column with [[1]].
df %>%
  nest(-Name) %>%
  group_by(Name) %>%
  summarize(`000` = sum(data[[1]] == "000", na.rm = TRUE),
            xxx = sum(data[[1]] == "xxx", na.rm = TRUE),
            Others = sum(!is.na(data[[1]])) - `000` - xxx)

# Variant 3: no nesting; do() receives each group's rows as `.` and .[-1]
# drops the first column (Name).
df %>%
  group_by(Name) %>%
  do(tibble(`000` = sum(.[-1] == "000", na.rm = TRUE),
            xxx = sum(.[-1] == "xxx", na.rm = TRUE),
            Others = sum(!is.na(.[-1])) - `000` - xxx)) %>%
  ungroup()
# # A tibble: 3 x 4
# Name `000` xxx Others
# <chr> <int> <int> <int>
# 1 a1 2 2 1
# 2 a2 1 1 1
# 3 a3 1 1 0
Note how rowwise and grouping by row work slightly differently.
Here's a base R translation too:
# Base R version: by() splits df by Name, each piece is reduced to a one-row
# data.frame of counts, and do.call(rbind, ...) stacks the rows.
# TRUE is written out: T is an ordinary variable that can be reassigned.
do.call(
  rbind,
  by(df, df$Name, function(x) data.frame(
    Name = x$Name[1],
    # x[-1] drops the first column (Name) before comparing
    `000` = sum(x[-1] == "000", na.rm = TRUE),
    xxx = sum(x[-1] == "xxx", na.rm = TRUE),
    Others = sum(x[-1] != "000" & x[-1] != "xxx", na.rm = TRUE))))
# Name X000 xxx Others
# a1 a1 2 2 1
# a2 a2 1 1 1
# a3 a3 1 1 0
If I understood it correctly, and the task is to count all xxx, 000, and !000&!xxx by Name we can also use base::table() to obtain desired output:
# FALSE is written out: F is an ordinary variable that can be reassigned.
df <- data.frame(Name = c("a1", "a1", "a2", "a3"),
                 per1 = c("000", "xxx", "000", "000"),
                 per2 = c("xxx", "000", 340, "xxx"),
                 per3 = c(230, NA, "xxx", NA),
                 stringsAsFactors = FALSE
)
Vals <- unlist(df[, -1])  # convert to the vector
Vals[!(Vals %in% c("000", "xxx")) & !is.na(Vals)] <- "Others"  # !(xxx|000) <- Others
# Name is repeated once per value column so that it lines up with Vals.
as.data.frame.matrix(                                      # group by Name, count
  table(rep(df$Name, ncol(df) - 1), Vals, useNA = "no")    # don't count NAs
)                                                          # convert to data.frame
# 000 Others xxx
#a1 2 1 2
#a2 1 1 1
#a3 1 0 1
I have a dataframe df1 with columns a,b,c. I want to assign c=0 to the first row of each group returned by group_by(a,b). I tried something like
# Asker's attempt: filter() drops every row that fails the row_number(a) == 1
# test, so the result has fewer rows than df1.
t <- df1 %>% group_by(a,b) %>% filter(row_number(a)==1) %>% mutate(c= 0)
But it reduced number of rows. Expected output is
a b c
a1 b1 0
a1 b1 NA
a2 b2 0
a2 b2 NA
You can use seq_along to number elements in each group from 1 to the total number of elements within each group (2, in this case). Then use ifelse to set the first element of 'c' for each group to be 0 and leave the other element as is.
library(dplyr)

# Within each (a, b) group set c to 0 on the first row and keep c as it is on
# all other rows.
df %>%
  group_by(a, b) %>%
  mutate(c = ifelse(row_number() == 1, 0, c))
# A tibble: 4 x 3
# Groups: a, b [2]
# a b c
# <fct> <fct> <dbl>
#1 a1 b1 0.
#2 a1 b1 NA
#3 a2 b2 0.
#4 a2 b2 NA
data
# Example data: two (a, b) groups of two rows each, with c missing everywhere.
df <- data.frame(
  a = rep(c("a1", "a2"), each = 2),
  b = rep(c("b1", "b2"), each = 2),
  c = rep(NA, 4)
)
df
# a b c
#1 a1 b1 NA
#2 a1 b1 NA
#3 a2 b2 NA
#4 a2 b2 NA