Related
Let's say I have this data frame. How would I go about removing only the NA values associated with name a without physically removing them manually?
a 1 4
a 7 3
a NA 4
a 6 3
a NA 4
a NA 3
a 2 4
a NA 3
a 1 4
b NA 2
c 3 NA
I've tried using the function !is.na, but that removes all the NA values in the column ID1 for all the names. How would I specifically target the ones that are associated with name a?
You could subset your data frame as follows:
# Flag the rows that belong to name "a" and have a missing ID1, then drop them.
is_a_missing <- df$name == "a" & is.na(df$ID1)
df_new <- df[!is_a_missing, ]
This can also be written as:
# Equivalent form: keep rows that are not "a" or whose ID1 is present.
keep_row <- df$name != "a" | !is.na(df$ID1)
df_new <- df[keep_row, ]
With dplyr:
library(dplyr)
# Keep a row unless it is both name "a" and missing ID1.
df %>%
  filter(name != "a" | !is.na(ID1))
Or with subset:
# Base R subset(): columns are referenced by bare name inside the condition.
subset(df, name != "a" | !is.na(ID1))
Output
name ID1 ID2
1 a 1 4
2 a 7 3
3 a 6 3
4 a 2 4
5 a 1 4
6 b NA 2
7 c 3 NA
Data
# Reproducible example data: 11 rows with columns name, ID1 and ID2.
# ID1 is NA for four of the "a" rows and for the "b" row; ID2 is NA for "c".
df <- structure(list(name = c("a", "a", "a", "a", "a", "a", "a", "a",
"a", "b", "c"), ID1 = c(1L, 7L, NA, 6L, NA, NA, 2L, NA, 1L, NA,
3L), ID2 = c(4L, 3L, 4L, 3L, 4L, 3L, 4L, 3L, 4L, 2L, NA)), class = "data.frame", row.names = c(NA,
-11L))
This question already has answers here:
Calculate the mean by group
(9 answers)
Closed 2 years ago.
I need to apply a function to several subsets of data of differing lengths within a column and generate a new data frame which includes the outputs and their associated metadata.
How can I do this without recourse to for loops? tapply() seems like a good place to start, but I struggle with the syntax.
For example -- I have something like this:
block plot id species type response
1 1 1 w a 1.5
1 1 2 w a 1
1 1 3 w a 2
1 1 4 w a 1.5
1 2 5 x a 5
1 2 6 x a 6
1 2 7 x a 7
1 3 8 y b 10
1 3 9 y b 11
1 3 10 y b 9
1 4 11 z b 1
1 4 12 z b 3
1 4 13 z b 2
2 5 14 w a 0.5
2 5 15 w a 1
2 5 16 w a 1.5
2 6 17 x a 3
2 6 18 x a 2
2 6 19 x a 4
2 7 20 y b 13
2 7 21 y b 12
2 7 22 y b 14
2 8 23 z b 2
2 8 24 z b 3
2 8 25 z b 4
2 8 26 z b 2
2 8 27 z b 4
And I want to produce something like this:
block plot species type mean.response
1 1 w a 1.5
1 2 x a 6
1 3 y b 10
1 4 z b 2
2 5 w a 1
2 6 x a 3
2 7 y b 13
2 8 z b 3
Try this. You can use group_by() to set the grouping variables and then summarise() to compute the expected variable. Here is the code using dplyr:
library(dplyr)
# Mean response for every block/plot/species/type combination.
# na.rm is spelled TRUE rather than T, because T is an ordinary variable
# that can be reassigned; missing responses are skipped by mean().
newdf <- df %>%
  group_by(block, plot, species, type) %>%
  summarise(Mean = mean(response, na.rm = TRUE))
Output:
# A tibble: 8 x 5
# Groups: block, plot, species [8]
block plot species type Mean
<int> <int> <chr> <chr> <dbl>
1 1 1 w a 1.5
2 1 2 x a 6
3 1 3 y b 10
4 1 4 z b 2
5 2 5 w a 1
6 2 6 x a 3
7 2 7 y b 13
8 2 8 z b 3
Or using base R (-3 is used to omit id variable in the aggregation):
# Base R: mean response per block/plot/species/type.
# df[, -3] drops the id column so that `.` in the formula expands to the
# grouping variables only. na.rm is forwarded to mean(); it is spelled TRUE
# rather than T, because T can be reassigned.
newdf <- aggregate(response ~ ., data = df[, -3], mean, na.rm = TRUE)
Output:
block plot species type response
1 1 1 w a 1.5
2 2 5 w a 1.0
3 1 2 x a 6.0
4 2 6 x a 3.0
5 1 3 y b 10.0
6 2 7 y b 13.0
7 1 4 z b 2.0
8 2 8 z b 3.0
Some data used:
# Reproducible example data: 27 rows with columns block, plot, id, species,
# type and response (id is simply 1:27).
df <- structure(list(block = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), plot = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L,
4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L
), id = 1:27, species = c("w", "w", "w", "w", "x", "x", "x",
"y", "y", "y", "z", "z", "z", "w", "w", "w", "x", "x", "x", "y",
"y", "y", "z", "z", "z", "z", "z"), type = c("a", "a", "a", "a",
"a", "a", "a", "b", "b", "b", "b", "b", "b", "a", "a", "a", "a",
"a", "a", "b", "b", "b", "b", "b", "b", "b", "b"), response = c(1.5,
1, 2, 1.5, 5, 6, 7, 10, 11, 9, 1, 3, 2, 0.5, 1, 1.5, 3, 2, 4,
13, 12, 14, 2, 3, 4, 2, 4)), class = "data.frame", row.names = c(NA,
-27L))
Use any of these where the input dd is given reproducibly in the Note at the end:
# 1. aggregate.formula - base R
# The cbind() wrapper only renames the output column; a bare `response` on
# the left-hand side is enough when the header does not matter.
aggregate(
  cbind(mean.response = response) ~ block + plot + species + type,
  data = dd,
  FUN = mean
)
# 2. aggregate.default - base R
group_cols <- c("block", "plot", "species", "type")
aggregate(list(mean.response = dd$response), by = dd[group_cols], FUN = mean)
# 3. sqldf
library(sqldf)
sqldf("select block, plot, species, type, avg(response) as [mean.response]
from dd group by 1, 2, 3, 4")
# 4. data.table
library(data.table)
group_cols <- c("block", "plot", "species", "type")
as.data.table(dd)[, .(mean.response = mean(response)), by = group_cols]
# 5. doBy - last column of output will be labelled response.mean
library(doBy)
summaryBy(response ~ block + plot + species + type, data = dd)
Note
The input in reproducible form:
# Input data frame dd: 27 rows with columns block, plot, id, species, type
# and response (id is simply 1:27).
dd <- structure(list(block = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), plot = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L,
4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L
), id = 1:27, species = c("w", "w", "w", "w", "x", "x", "x",
"y", "y", "y", "z", "z", "z", "w", "w", "w", "x", "x", "x", "y",
"y", "y", "z", "z", "z", "z", "z"), type = c("a", "a", "a", "a",
"a", "a", "a", "b", "b", "b", "b", "b", "b", "a", "a", "a", "a",
"a", "a", "b", "b", "b", "b", "b", "b", "b", "b"), response = c(1.5,
1, 2, 1.5, 5, 6, 7, 10, 11, 9, 1, 3, 2, 0.5, 1, 1.5, 3, 2, 4,
13, 12, 14, 2, 3, 4, 2, 4)), class = "data.frame", row.names = c(NA,
-27L))
This is my dataframe:
# Despite its name, df is a plain list of three data frames. Each one has
# six rows and two factor columns, Col1 and Col2.
df<-list(structure(list(Col1 = structure(1:6, .Label = c("A", "B",
"C", "D", "E", "F"), class = "factor"), Col2 = structure(c(1L,
2L, 3L, 2L, 4L, 5L), .Label = c("B", "C", "D", "F", "G"), class = "factor")), class = "data.frame", row.names = c(NA,
-6L)), structure(list(Col1 = structure(c(1L, 4L, 5L, 6L, 2L,
3L), .Label = c("A", "E", "H", "M", "N", "P"), class = "factor"),
Col2 = structure(c(1L, 2L, 3L, 2L, 4L, 5L), .Label = c("B",
"C", "D", "F", "G"), class = "factor")), class = "data.frame", row.names = c(NA,
-6L)), structure(list(Col1 = structure(c(1L, 4L, 6L, 5L, 2L,
3L), .Label = c("A", "W", "H", "M", "T", "U"), class = "factor"),
Col2 = structure(c(1L, 2L, 3L, 2L, 4L, 5L), .Label = c("B",
"C", "D", "S", "G"), class = "factor")), class = "data.frame", row.names = c(NA,
-6L)))
I want to extract col1=df[[1]][1] as a dataframe. Then col1 of the second position of this list I want to merge to the df[[1]][1], then I will have a dataframe with 2 columns.
After this I want to merge the column 1 of the third position of the list to the dataframe with two columns, then I will have a dataframe with 3 columns.
In other words my dataframe should have 3 columns, all the first columns of each entry of my list.
Can the dplyr package help me do this?
Any help?
You can use lapply to extract the three columns named "Col1" in one go. Then set the names of the result.
# Pull the "Col1" vector out of every data frame in the list, bind the
# vectors column-wise, then label the columns a, b, c, ...
col1 <- as.data.frame(lapply(df, function(d) d[["Col1"]]))
col1 <- setNames(col1, letters[seq_len(ncol(col1))])
col1
# a b c
#1 A A A
#2 B M M
#3 C N U
#4 D P T
#5 E E W
#6 F H H
Choose any other column names that you might find better.
A dplyr way could be
# Flatten the list of data frames one level into a single list of columns,
# turn that into one wide data frame, and keep the columns whose names
# start with "Col1".
df %>%
  unlist(recursive = FALSE) %>%
  as.data.frame() %>%
  select(starts_with("Col1"))
# Col1 Col1.1 Col1.2
#1 A A A
#2 B M M
#3 C N U
#4 D P T
#5 E E W
#6 F H H
With map_dfc from purrr:
library(purrr)
# Single-bracket extraction keeps each piece as a one-column data frame;
# map_dfc() then binds the pieces column-wise.
map_dfc(df, function(d) d[1])
Output:
Col1 Col11 Col12
1 A A A
2 B M M
3 C N U
4 D P T
5 E E W
6 F H H
Alternative use of map_dfc making use of purrr's concise element extraction syntax that allows specifying elements of elements by name or position. The first is, for example, equivalent to
# Double-bracket extraction pulls the first column of each data frame out
# as a bare vector before the column-wise bind.
map_dfc(df, `[[`, 1)
which differs from the use of [ in that the columns will not be named variations of Col1 and just get V names instead, which may be desirable since names like Col11 and Col12 may be confusing.
# df is a list of three data frames, each with factor columns Col1 and Col2.
df <- list(structure(list(Col1 = structure(1:6, .Label = c("A", "B", "C", "D", "E", "F"), class = "factor"), Col2 = structure(c(1L, 2L, 3L, 2L, 4L, 5L), .Label = c("B", "C", "D", "F", "G"), class = "factor")), class = "data.frame", row.names = c(NA, -6L)), structure(list(Col1 = structure(c(1L, 4L, 5L, 6L, 2L, 3L), .Label = c("A", "E", "H", "M", "N", "P"), class = "factor"), Col2 = structure(c(1L, 2L, 3L, 2L, 4L, 5L), .Label = c("B", "C", "D", "F", "G"), class = "factor")), class = "data.frame", row.names = c(NA, -6L)), structure(list(Col1 = structure(c(1L, 4L, 6L, 5L, 2L, 3L), .Label = c("A", "W", "H", "M", "T", "U"), class = "factor"), Col2 = structure(c(1L, 2L, 3L, 2L, 4L, 5L), .Label = c("B", "C", "D", "S", "G"), class = "factor")), class = "data.frame", row.names = c(NA, -6L)))
library(purrr)
# Passing a position instead of a function makes purrr extract that element
# (here the first column) from every data frame in the list.
map_dfc(df, 1)
#> # A tibble: 6 x 3
#> V1 V2 V3
#> <fct> <fct> <fct>
#> 1 A A A
#> 2 B M M
#> 3 C N U
#> 4 D P T
#> 5 E E W
#> 6 F H H
# Passing a name instead of a function extracts the element called "Col1"
# from every data frame in the list.
map_dfc(df, "Col1")
#> # A tibble: 6 x 3
#> V1 V2 V3
#> <fct> <fct> <fct>
#> 1 A A A
#> 2 B M M
#> 3 C N U
#> 4 D P T
#> 5 E E W
#> 6 F H H
Created on 2018-09-19 by the reprex package (v0.2.0).
# Bind the first column of every data frame in the list side by side.
# `[` (rather than `[[`) keeps each piece as a one-column data frame, so
# cbind() preserves the original column names (Col1, Col1, Col1).
# This replaces a loop that grew `res` with cbind() on every iteration,
# iterated over 1:length(df) (which misbehaves on an empty list), printed
# each column as a debugging side effect, and needed a dummy `res` column
# that was deleted afterwards.
res <- do.call(cbind, lapply(df, `[`, 1))
So, the output is:
Col1 Col1 Col1
1 A A A
2 B M M
3 C N U
4 D P T
5 E E W
6 F H H
Using dplyr
library(dplyr)
# sapply() extracts the first column of every data frame and simplifies
# the result to a matrix; as.data.frame() then gives default V1, V2, ...
# column names.
df %>%
  sapply(function(d) d[[1]]) %>%
  as.data.frame()
#returns
V1 V2 V3
1 A A A
2 B M M
3 C N U
4 D P T
5 E E W
6 F H H
I have the following data frames:
# df1
id cg_v
1 a
2 b
3 a b
4 b c
5 b c d
6 d
# df2
id cg
1 a
2 b
3 a
3 b
4 b
4 c
5 b
5 c
5 d
6 d
I need to add a column to df1 that contains the mean covariance computed across each pair of elements in cg_v. If cg_v contains only one element, then I would like the new column to contain its variance.
I can get a covariance matrix by cov(crossprod(table(df2)))
# a b c d
a 0.9166667 0.0000000 -0.5833333 -0.6666667
b 0.0000000 2.0000000 1.0000000 0.0000000
c -0.5833333 1.0000000 0.9166667 0.3333333
d -0.6666667 0.0000000 0.3333333 0.6666667
What do I do from here?
The end result should be like this:
# df1
id cg_v cg_cov
1 a 0.9166667
2 b 2.0000000
3 a b 0.0000000
4 b c 1.0000000
5 b c d 0.4444444 # This is equal to (1.0000000 + 0.3333333 + 0.0000000)/3
6 d 0.6666667
Code to generate df1 and df2:
# df1: one row per id; cg_v holds one or more space-separated group letters.
df1 <- structure(list(id = c(1L, 2L, 3L, 4L, 5L, 6L),
cg_v = c("a", "b", "a b", "b c", "b c d", "d")),
.Names = c("id", "cg_v"),
class = "data.frame", row.names = c(NA, -6L))
# df2: the same information in long form, one row per (id, letter) pair.
df2 <- structure(list(id = c(1L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 5L, 6L),
cg = c("a", "b", "a", "b", "b", "c", "b", "c", "d", "d")),
.Names = c("id", "cg"),
class = "data.frame", row.names = c(NA, -10L))
I think I found a solution for this problem using data.tables and reshape. What do you want to do with the three letters b c d? I assumed that you want to have the covariance of the first two letters:
# library() fails loudly when a package is missing; require() would only
# return FALSE and let the script fail later with a less helpful error.
library(reshape)
library(data.table)
dt1 <- data.table(id = c(1L, 2L, 3L, 4L, 5L, 6L),
                  cg_v = c("a", "b", "a b", "b c", "b c d", "d"))
dt2 <- data.table(id = c(1L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 5L, 6L),
                  cg = c("a", "b", "a", "b", "b", "c", "b", "c", "d", "d"))
# Long-format covariance table with columns X1, X2 and value.
# reshape::melt is called explicitly: data.table also exports melt() and,
# being attached last, would otherwise mask the matrix method needed here.
# The tables built above (dt1/dt2) are used throughout, so the snippet no
# longer depends on df1/df2 existing in the workspace.
cov_dt <- data.table(reshape::melt(cov(crossprod(table(dt2)))))
# First two letters of every cg_v become columns V1 and V2 (V2 is NA when
# cg_v has a single letter). Only the first pair is used for longer entries.
first_two <- vapply(strsplit(as.character(dt1$cg_v), " "),
                    function(x) x[1:2], character(2))
dt1 <- cbind(dt1, t(first_two))
# Replace the NA with the first column, so single letters look up their
# own variance on the diagonal.
dt1[is.na(V2), V2 := V1]
# Merge them on two columns
setkey(dt1, "V1", "V2")
setkey(cov_dt, "X1", "X2")
result <- cov_dt[dt1]
> result[,.(id, cg_v, value)]
id cg_v value
1: 1 a 0.9166667
2: 3 a b 0.0000000
3: 2 b 2.0000000
4: 4 b c 1.0000000
5: 5 b c d 1.0000000
6: 6 d 0.6666667
Variant which also works if there are more than 2 letters (not the most efficient code):
# Mean pairwise covariance for every row of df1, using base R only.
# utils::combn() always returns a matrix with one pair per column, so the
# reshape and combinat packages are not needed here.
df1 <- data.frame(id = c(1L, 2L, 3L, 4L, 5L, 6L),
                  cg_v = c("a", "b", "a b", "b c", "b c d", "d"))
df2 <- data.frame(id = c(1L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 5L, 6L),
                  cg = c("a", "b", "a", "b", "b", "c", "b", "c", "d", "d"))
cov_dt <- cov(crossprod(table(df2)))
# Split every cg_v into its letters. A single letter is duplicated so that
# its only "pair" is (x, x), i.e. the variance on the diagonal.
# lapply() always returns a list; sapply() would collapse to a matrix if
# every row happened to have the same number of letters.
letter_sets <- lapply(strsplit(as.character(df1$cg_v), " "), function(x) {
  if (length(x) == 1) c(x, x) else x
})
# Every set must now contain at least two letters.
stopifnot(lengths(letter_sets) > 1)
df1$cg_cov <- vapply(letter_sets, function(x) {
  pairs <- utils::combn(x, 2)
  # A two-column character matrix indexes cov_dt by (row name, column name).
  mean(cov_dt[cbind(pairs[1, ], pairs[2, ])])
}, numeric(1))
> df1
id cg_v cg_cov
1 1 a 0.9166667
2 2 b 2.0000000
3 3 a b 0.0000000
4 4 b c 1.0000000
5 5 b c d 0.4444444
6 6 d 0.6666667
I would like to write a function or loop that will create three new columns, then fill these columns with either the same value or a specified value, if the value in the original column is within one of three specified lists.
For example, here is what the data looks like:
> data
a1 a2 a3
1 C C A
2 A B_20 B_20
3 A C B_30
4 C C B_40
5 C A A
6 B_60 B_60 B_60
7 A A C
8 A C B_80
9 B_90 C B_90
I want to create three new columns (a1_t, a2_t, a3_t) where if a1 is in list1
list1 <- c("B_10", "B_20", "B_30")
then fill in a1_t, with B_00_30
or if a1 is in list2
list2 <- c("B_40", "B_50", "B_60")
then fill in a1_t, with B_40_60
or if a1 is in list3
list3 <- c("B_70", "B_80", "B_90")
then fill in a1_t, with B_70_90
if not in list1, list2, or list3, then place the value from a1 to a1_t.
Then iterate through this same procedure for a2_t and a3_t using a2 and a3 for matching.
In the end I would like the output to look like this:
> data
a1 a2 a3 a1_t a2_t a3_t
1 A A B_10 A A B_00_30
2 B_20 A C B_00_30 A C
3 B_30 A C B_00_30 A C
4 C C A C C A
5 A B_50 B_50 A B_40_60 B_40_60
6 C C A C C A
7 C B_70 A C B_70_90 A
8 B_80 C B_80 B_70_90 C B_70_90
9 B_90 C A B_70_90 C A
To create original raw data:
# Raw input: nine rows and three character columns a1, a2, a3 holding
# either a plain letter or a B_<number> code.
data <- structure(list(a1 = c("A", "B_20", "B_30", "C", "A", "C", "C",
"B_80", "B_90"), a2 = c("A", "A", "A", "C", "B_50", "C", "B_70",
"C", "C"), a3 = c("B_10", "C", "C", "A", "B_50", "A", "A", "B_80",
"A")), class = "data.frame", .Names = c("a1", "a2", "a3"), row.names = c(NA,
-9L))
To create desired output data:
# Desired result: the three input columns plus their recoded a*_t
# counterparts, all stored as factors.
# NOTE: this assigns to `data` as well, so running it replaces any raw
# input already stored under that name.
data <- structure(list(a1 = structure(c(1L, 2L, 3L, 6L, 1L, 6L, 6L, 4L, 5L), .Label = c("A", "B_20", "B_30", "B_80", "B_90", "C"), class = "factor"),
a2 = structure(c(1L, 1L, 1L, 4L, 2L, 4L, 3L, 4L, 4L), .Label = c("A", "B_50", "B_70", "C"), class = "factor"),
a3 = structure(c(2L, 5L, 5L, 1L, 3L, 1L, 1L, 4L, 1L), .Label = c("A", "B_10", "B_50", "B_80", "C"), class = "factor"),
a1_t = structure(c(1L, 2L, 2L, 4L, 1L, 4L, 4L, 3L, 3L), .Label = c("A", "B_00_30", "B_70_90", "C"), class = "factor"),
a2_t = structure(c(1L, 1L, 1L, 4L, 2L, 4L, 3L, 4L, 4L), .Label = c("A", "B_40_60", "B_70_90", "C"), class = "factor"),
a3_t = structure(c(2L, 5L, 5L, 1L, 3L, 1L, 1L, 4L, 1L), .Label = c("A", "B_00_30", "B_40_60", "B_70_90", "C"), class = "factor")),
.Names = c("a1", "a2", "a3", "a1_t", "a2_t", "a3_t"), class = "data.frame", row.names = c(NA, -9L))
Thanks
-al
Final working code w/ answer:
library(dplyr)
list1 <- c("B_10", "B_20", "B_30")
list2 <- c("B_40", "B_50", "B_60")
list3 <- c("B_70", "B_80", "B_90")
# Two-column lookup matrix: column 1 is the original code, column 2 the
# range label it is replaced by.
lookup <- rbind(cbind(list = list1, val = "B_00_30"),
                cbind(list2, "B_40_60"),
                cbind(list3, "B_70_90"))
# Recode every column of data; values without a lookup entry are kept.
# as.character() guards against factor columns, for which ifelse() would
# otherwise return the underlying integer codes instead of the labels.
g <- lapply(data, function(x) {
  x <- as.character(x)
  tmp <- lookup[, 2][match(x, lookup[, 1])]
  ifelse(is.na(tmp), x, tmp)
})
# Name the recoded columns <original>_t directly from the input's column
# names, so this works for any number of columns.
gd <- as.data.frame(g)
names(gd) <- paste0(names(data), "_t")
h <- cbind(data, gd)
> h
a1 a2 a3 a1_t a2_t a3_t
1 A A B_10 A A B_00_30
2 B_20 A C B_00_30 A C
3 B_30 A C B_00_30 A C
4 C C A C C A
5 A B_50 B_50 A B_40_60 B_40_60
6 C C A C C A
7 C B_70 A C B_70_90 A
8 B_80 C B_80 B_70_90 C B_70_90
9 B_90 C A B_70_90 C A
A way could be:
# Two-column lookup matrix: original code in column 1, replacement label
# in column 2.
lookup <- rbind(
  cbind(list = list1, val = "B_00_30"),
  cbind(list2, "B_40_60"),
  cbind(list3, "B_70_90")
)
# For every column, swap in the label where the value has a lookup entry
# and keep the value itself otherwise.
sapply(data, function(column) {
  matched <- lookup[match(column, lookup[, 1]), 2]
  ifelse(is.na(matched), column, matched)
})
# a1 a2 a3
# [1,] "A" "A" "B_00_30"
# [2,] "B_00_30" "A" "C"
# [3,] "B_00_30" "A" "C"
# [4,] "C" "C" "A"
# [5,] "A" "B_40_60" "B_40_60"
# [6,] "C" "C" "A"
# [7,] "C" "B_70_90" "A"
# [8,] "B_70_90" "C" "B_70_90"
# [9,] "B_70_90" "C" "A"
Then you can cbind to "data" and coerce to "data.frame" as needed.
Another way using cut
# Strip the one-character prefix and underscore, convert what is left to
# numbers, and bin into (0,30], (30,60], (60,90]. labels = FALSE (spelled
# out, because F can be reassigned) returns the integer bin index.
indx <- cut(as.numeric(gsub(".\\_", "", as.matrix(data))),
            breaks = c(0, 30, 60, 90), labels = FALSE)
(Here, you will get a warning message because as.numeric on those elements that are characters will coerce them to NAs, which was my intention.)
or using info from list1:list3
# Upper bound of each range, read from the numeric suffix of the codes.
# The three vectors are listed explicitly: mget(ls(pattern = "list")) would
# also pick up any other object whose name contains "list".
val <- vapply(
  list(list1 = list1, list2 = list2, list3 = list3),
  function(x) max(as.numeric(gsub("._", "", x))),
  numeric(1)
)
val
# list1 list2 list3
# 30 60 90
# The breaks can be built from val instead of being hard-coded:
#indx <- cut(as.numeric(gsub(".\\_","",as.matrix(data))),breaks=c(0,val),labels=FALSE)
# Turn the bin indices into range labels (this makes indx a character vector).
indx[!is.na(indx)] <- c("B_00_30", "B_40_60", "B_70_90")[indx[!is.na(indx)]]
# The remaining NAs are the plain letters; restore them from data.
indx[is.na(indx)] <- unlist(data)[!grepl("_", unlist(data))]
data1 <- data
data1[] <- indx
colnames(data1) <- paste(colnames(data1), "t", sep = "_")
Update
To avoid the warning message, you could do:
m1 <- as.matrix(data)
# Strip the prefix once and reuse the result for both the digit test and
# the numeric conversion.
stripped <- gsub(".\\_", "", m1)
# TRUE for the cells that still contain a digit, i.e. the B_<number> codes.
indx <- grepl("\\d", stripped)
# Bin only those cells, so as.numeric() never sees a plain letter and no
# coercion warning is raised. labels = FALSE (spelled out, because F can be
# reassigned) returns the integer bin index.
indx1 <- cut(as.numeric(stripped[indx]), breaks = c(0, 30, 60, 90), labels = FALSE)
m1[indx] <- c("B_00_30", "B_40_60", "B_70_90")[indx1]
data1 <- data
data1[] <- m1
colnames(data1) <- paste(colnames(data1), "t", sep = "_")
cbind(data, data1)
# a1 a2 a3 a1_t a2_t a3_t
# 1 A A B_10 A A B_00_30
# 2 B_20 A C B_00_30 A C
# 3 B_30 A C B_00_30 A C
# 4 C C A C C A
# 5 A B_50 B_50 A B_40_60 B_40_60
# 6 C C A C C A
# 7 C B_70 A C B_70_90 A
# 8 B_80 C B_80 B_70_90 C B_70_90
# 9 B_90 C A B_70_90 C A