Related
I have a data frame which looks like this:
df
colA colB
0 0
1 1
0 1
0 1
0 1
1 0
0 0
1 1
0 1
I would like to convert a certain proportion of the 0 in colA to NA and a certain proportion of 1 in colB to NA
if I do this:
df["colA"][df["colA"] == 0] <- NA
all the 0s in colA will be converted to NA; however, I just want half of them to be converted.
Similarly, for colB I want only 1/3 of the 1 to be converted:
df["colB"][df["colB"] == 1] <- NA
Expected output:
colA colB
0 0
1 1
NA 1
0 1
NA 1
1 0
0 0
1 NA
NA NA
One way
# Row positions where colA equals 0 (which() also drops NA comparisons).
zero_rows <- which(df$colA == 0)
# Number of those rows to blank out: half of them, rounded.
n_blank <- round(length(zero_rows) / 2)
# Draw positions with sample.int() and index into zero_rows, so a single
# candidate row is never misread by sample() as the range 1:row.
df$colA[zero_rows[sample.int(length(zero_rows), n_blank)]] <- NA
similar for colB
# Row positions where colB equals 1 (which() also drops NA comparisons).
one_rows <- which(df$colB == 1)
# Number of those rows to blank out: a third of them, rounded.
n_blank <- round(length(one_rows) / 3)
# Draw positions with sample.int() and index into one_rows, so a single
# candidate row is never misread by sample() as the range 1:row.
df$colB[one_rows[sample.int(length(one_rows), n_blank)]] <- NA
You can use prodNA from the missForest package
set.seed(1)
library(missForest)
# Rows that are candidates for blanking, computed once per column.
# which() yields plain positions, so existing NAs never enter the index.
a_zero <- which(df$colA == 0)
b_one <- which(df$colB == 1)
# prodNA() receives a one-column data frame (hence drop = FALSE) together
# with noNA, the share of its entries to replace by NA.
df[a_zero, "colA"] <- prodNA(df[a_zero, "colA", drop = FALSE], noNA = 0.5)
df[b_one, "colB"] <- prodNA(df[b_one, "colB", drop = FALSE], noNA = 1 / 3)
df
colA colB
1 NA 0
2 1 NA
3 0 NA
4 NA 1
5 NA 1
6 1 0
7 0 0
8 1 1
9 0 1
I'll contribute a tidyverse approach here.
library(tidyverse)
# Tag every candidate row with its row number (non-candidates get NA), draw
# the wanted share of those tags at random, set the drawn rows to NA, and
# finally drop the helper columns again.
df %>%
  mutate(
    # Candidates in colA are the rows that are not 1.
    id_colA = ifelse(colA == 1, NA, row_number()),
    # %/% keeps the sample size a whole number (half of the candidates).
    colA = ifelse(
      id_colA %in% sample(na.omit(id_colA), sum(!is.na(id_colA)) %/% 2),
      NA, colA
    ),
    # Candidates in colB are the rows that are not 0.
    id_colB = ifelse(colB == 0, NA, row_number()),
    # A third of the candidates, again as a whole number.
    colB = ifelse(
      id_colB %in% sample(na.omit(id_colB), sum(!is.na(id_colB)) %/% 3),
      NA, colB
    )
  ) %>%
  select(-starts_with("id_"))
I have a data frame like the one below...
df <- data.frame(B1994 = c(1,0,0,0,1,0,0,1,1,0),
B1995 = c(1,1,1,0,0,1,1,1,0,0),
B1996 = c(0,0,0,0,0,0,1,1,1,0),
B1997 = c(1,0,1,0,0,1,0,1,1,1),
B1998 = c(1,0,0,0,1,0,1,0,0,1)
)
I am now trying to calculate the longest consecutive sequence of 0's across all of the columns (for each row) in this data frame, and populate a new column with these values, like this data frame below...
df2 <- data.frame(B1994 = c(1,0,0,0,1,0,0,1,1,0),
B1995 = c(1,1,1,0,0,1,1,1,0,0),
B1996 = c(0,0,0,0,0,0,1,1,1,0),
B1997 = c(1,0,1,0,0,1,0,1,1,1),
B1998 = c(1,0,0,0,1,0,1,0,0,1),
Longest_0_Interval = c(1,3,1,5,3,1,1,1,1,3)
)
Is there an easy solution for this in R?
You can use rle()
df <- data.frame(
  B1994 = c(1, 0, 0, 0, 1, 0, 0, 1, 1, 0),
  B1995 = c(1, 1, 1, 0, 0, 1, 1, 1, 0, 0),
  B1996 = c(0, 0, 0, 0, 0, 0, 1, 1, 1, 0),
  B1997 = c(1, 0, 1, 0, 0, 1, 0, 1, 1, 1),
  B1998 = c(1, 0, 0, 0, 1, 0, 1, 0, 0, 1)
)

# Length of the longest run of consecutive zeros in the vector x.
# Returns 0 when x contains no zero at all (instead of -Inf plus a warning
# from max() on an empty selection).
maxl0 <- function(x) {
  runs <- rle(x)
  # which() keeps only runs whose value is exactly 0 and skips NA runs.
  zero_lengths <- runs$lengths[which(runs$values == 0)]
  if (length(zero_lengths) == 0L) {
    return(0L)
  }
  max(zero_lengths)
}

# Apply the helper to every row; the year columns are read left to right.
df$Longest_0_Interval <- apply(df, 1, maxl0)
One dplyr option could be:
# Row by row, run-length encode the values of all columns and keep the
# longest run of zeros. Starting max() at 0L makes a row without any zero
# yield 0 rather than -Inf with a warning.
df %>%
  rowwise() %>%
  mutate(
    Longest_0_Interval = with(
      rle(c_across(everything())),
      max(0L, lengths[which(values == 0)])
    )
  ) %>%
  # Drop the row-wise grouping so later verbs work on the whole table again.
  ungroup()
B1994 B1995 B1996 B1997 B1998 Longest_0_Interval
<dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 1 1 0 1 1 1
2 0 1 0 0 0 3
3 0 1 0 1 0 1
4 0 0 0 0 0 5
5 1 0 0 0 1 3
6 0 1 0 1 0 1
7 0 1 1 0 1 1
8 1 1 1 1 0 1
9 1 0 1 1 0 1
10 0 0 0 1 1 3
Split a string, build columns with unique values, and fill values according to string.
Sample data.table:
library(data.table)
(dt <- data.table(id = as.numeric(1:5),
x = c(NA, "ab.cde", "co.hij.ab", "cox.cde.kl", NA)))
dcast Approach: close but not quite
dcast(dt, id ~ x, value.var = "id")
dt[dcast(dt, id ~ x, value.var = "id"), on=.(id = id)]
dcast builds some columns and fills some values, but it doesn't do what I want.
string split Approach: I can't transpose
dt[, unique(unlist(strsplit(dt$x, ".", fixed = TRUE))) :=
tstrsplit(dt$x, ".", fixed = TRUE)]
the message says that my LHS has 7 columns while my RHS only has 3. So transposing doesn't work. Maybe I can build the columns and fill the values later:
dt[, unique(unlist(strsplit(dt$x, ".", fixed = TRUE))) := character()]
And now I'm getting close, but still not there. I need to fill those columns with 1s and 0s according to a match (or something) on dt$x;
id 1 should have a 1 on column: NA
id 2 should have a 1 on columns: ab, and cde
id 3 should have a 1 on columns: co, hij, and ab
id 4 should have a 1 on columns: cox, cde, and kl
id 5 should have a 1 on column: NA
We can use data.table methods i.e. dcast
library(data.table)
# Build a long table and cast it wide in one expression:
#  - strsplit(x, "\\.") splits every x on literal dots (x1 is a list with one
#    character vector per row);
#  - list(unlist(x1)) supplies the individual pieces as a new, unnamed first
#    column, which the formula below refers to as V1;
#  - .SD[rep(seq_len(.N), lengths(x1))] repeats each original row once per
#    piece so that id and x line up with the pieces;
#  - dcast(..., id + x ~ V1, length) then spreads the pieces into columns and
#    uses length as the aggregate, i.e. it counts how often each piece occurs
#    for every id/x combination.
dcast(dt[, {x1 <- strsplit(x, "\\."); c(list(unlist(x1)),
.SD[rep(seq_len(.N), lengths(x1))])}], id + x ~ V1, length)
# id x NA ab cde co cox hij kl
#1: 1 <NA> 1 0 0 0 0 0 0
#2: 2 ab.cde 0 1 1 0 0 0 0
#3: 3 co.hij.ab 0 1 0 1 0 1 0
#4: 4 cox.cde.kl 0 0 1 0 1 0 1
#5: 5 <NA> 1 0 0 0 0 0 0
One option using dplyr and tidyr is to split the string on "." and put it into separate rows and then spread it into wide format.
library(dplyr)
library(tidyr)
# Read from the innermost call outwards:
#  1. copy x into x1 so the original string survives the split,
#  2. explode x into one row per dot-separated piece,
#  3. mark every piece with temp = 1,
#  4. widen so each piece becomes its own column, filling gaps with 0.
spread(
  mutate(
    separate_rows(
      mutate(dt, x1 = x),
      x,
      sep = "\\."
    ),
    temp = 1
  ),
  x, temp,
  fill = 0
)
# id x1 ab cde co cox hij kl <NA>
#1 1 <NA> 0 0 0 0 0 0 1
#2 2 ab.cde 1 1 0 0 0 0 0
#3 3 co.hij.ab 1 0 1 0 1 0 0
#4 4 cox.cde.kl 0 1 0 1 0 1 0
#5 5 <NA> 0 0 0 0 0 0 1
I have a data.frame with vegetation in a presence-absence matrix and ELLENBERG values for moisture (values 1-9 plus the indicator symbols ! and =). Now I want to count the plants in every column (observation point) for each ELLENBERG value.
T1-T4 are my observation points: when the plant is present the value is 1, when absent 0. F_Nr holds my ELLENBERG values from 1 to 9, and F_sym holds the indicators ! and =. In my output I count the values, i.e. in T1 I have one plant with 4, two with 7, one with ! and one with =.
Here some small example data:
set.seed(1)
df <- df2 <- data.frame(name=c("Acer campestre", "Acer negundo", "Achillea millefolium agg.", "Agrostis stolonifera", "Alnus glutinosa", "Alnus incana"),
T1=rbinom(6, 1, .5), T2=rbinom(6, 1, .5), T3=rbinom(6, 1, .5), T4=rbinom(6, 1, .5),
F_Nr=c(5,6,4,7,9,7), F_sym=c(NA, NA, NA, "!","=", "="))
I expect a matrix like this, so that I can create plots of the distribution of the values.
df_count <- data.frame(F_sum=c(1,2,3,4,5,6,7,8,9,"=", "!"),
T1=c(0,0,0,1,0,0,2,0,0,1,0),
T2=c(0,0,0,1,1,1,0,0,0,0,0),
T3=c(0,0,0,1,1,0,1,0,1,1,1),
T4=c(0,0,0,1,0,1,0,0,1,1,0))
Thanks for your help
We can use a combination of aggregate() and merge().
# Example data. The <NA> markers are declared via na.strings so that F_sym
# holds real missing values instead of the literal text "<NA>".
df2 <- read.table(text="
name T1 T2 T3 T4 F_Nr F_sym
'Acer campestre' 0 1 1 0 5 <NA>
'Acer negundo' 0 1 0 1 6 <NA>
'Achillea millefolium agg.' 1 1 1 1 4 <NA>
'Agrostis stolonifera' 1 0 0 0 7 !
'Alnus glutinosa' 0 0 1 1 9 =
'Alnus incana' 1 0 1 0 7 =",
header = TRUE, stringsAsFactors = FALSE, na.strings = c("NA", "<NA>"))

# Observation-point columns to sum, selected by name rather than position.
obs_cols <- c("T1", "T2", "T3", "T4")
# Presence counts per numeric value and per indicator symbol; aggregate()
# leaves out rows whose grouping value is NA.
fnr <- aggregate(df2[obs_cols], list(df2$F_Nr), sum)
fsm <- aggregate(df2[obs_cols], list(df2$F_sym), sum)
counts0 <- rbind(fnr, fsm)
# Full list of categories wanted in the result, in display order.
dtf <- data.frame(F_sum = c(1:9, "=", "!"), stringsAsFactors = FALSE)
# all.x = TRUE keeps categories that never occur; their counts come back NA.
counts <- merge(dtf, counts0, by.x = "F_sum", by.y = "Group.1", all.x = TRUE)
counts[is.na(counts)] <- 0
# merge() sorts by key, so restore the display order of dtf.
counts[match(dtf$F_sum, counts$F_sum), ]
# F_sum T1 T2 T3 T4
# 3 1 0 0 0 0
# 4 2 0 0 0 0
# 5 3 0 0 0 0
# 6 4 1 1 1 1
# 7 5 0 1 1 0
# 8 6 0 1 0 1
# 9 7 2 0 1 0
# 10 8 0 0 0 0
# 11 9 0 0 1 1
# 2 = 1 0 2 1
# 1 ! 1 0 0 0
I have a df that looks like this.
Date Winner
4/12 Tom
4/13 Abe
4/14 George
4/15 Tom
I would like to add new columns that assign a 1 if the name appears in the Winner column and a 0 if it does not, and vice versa for the losing columns. Ideally the df would look like this as a result:
Date Winner Tom_Win Tom_Lose Abe_Win Abe_Lose George_Win George_Lose
4/12 Tom 1 0 0 1 0 1
4/13 Abe 0 1 1 0 0 1
4/14 George 0 1 0 1 1 0
4/15 Tom 1 0 0 1 0 1
Is there an easy way to accomplish this?
This is extremely simple to do with the model.matrix function: it creates N dummy columns holding 0 when the name does not appear and 1 when it does (exactly as you requested). The code is below:
(assuming your data is called db)
> winners <- model.matrix(~Winner - 1, data=db)
> winners
WinnerAbe WinnerGeorge WinnerTom
1 0 0 1
2 1 0 0
3 0 1 0
4 0 0 1
This bit is to compute the columns with the losing values
# Convert the model matrix to a data frame so columns can be added by name.
winners <- as.data.frame(winners)
# A loss is "did not win": 1 where WinnerAbe is 0 and 0 where it is 1.
# Naturally you have to do this for every column you need.
winners[["loserAbe"]] <- as.numeric(winners[["WinnerAbe"]] == 0)
WinnerAbe WinnerGeorge WinnerTom loserAbe
1 0 0 1 1
2 1 0 0 0
3 0 1 0 1
4 0 0 1 1
winners$Date <- db$Date #this last bit so you don't lose the date.
Using mtabulate from qdapTools package we can do the following three steps,
library(qdapTools)
# Count table with one column per distinct winner.
d1 <- mtabulate(d3$Winner)
# Complementary "lose" indicators: 0 where the count is 1, otherwise 1.
# lapply() keeps one list element per column regardless of the number of
# rows, whereas sapply() collapses a one-row table into a plain vector and
# breaks the column layout.
d2 <- setNames(
  data.frame(lapply(d1, function(i) ifelse(i == 1, 0, 1))),
  paste0(names(d1), "_Lose")
)
cbind(d3$Date, d1, d2)
# d3$Date Abe George Tom Abe_Lose George_Lose Tom_Lose
#1 4/12 0 0 1 1 1 0
#2 4/13 1 0 0 0 1 1
#3 4/14 0 1 0 1 0 1
#4 4/15 0 0 1 1 1 0
DATA
str(d3)
'data.frame': 4 obs. of 2 variables:
$ Date : Factor w/ 4 levels "4/12","4/13",..: 1 2 3 4
$ Winner: Factor w/ 3 levels "Abe","George",..: 3 1 2 3
I'm sure there is a better way than this but this works in base R and it's fairly simple:
If your data looks like this:
df <- data.frame(
  Date = c("4/12", "4/13", "4/14", "4/15"),
  Winner = c("Tom", "Abe", "George", "Tom")
)
# Append the extra columns, all initialised to 0. Players are listed in order
# of first appearance; working on a character copy makes this behave the same
# whether Winner is stored as character or as factor.
winner <- as.character(df$Winner)
players <- unique(winner[!is.na(winner)])
xcols <- c(paste0(players, "_Win"), paste0(players, "_Lose"))
df[, xcols] <- 0
# Now give the points for every player by indexing the rows directly (no
# code strings, no eval(parse())): 1 in <player>_Win on the days they won
# and 1 in <player>_Lose on all other days.
for (player in players) {
  # %in% never returns NA, so the mask is always safe to use as a row index.
  won <- winner %in% player
  df[won, paste0(player, "_Win")] <- 1
  df[!won, paste0(player, "_Lose")] <- 1
}
df <- data.frame(
  Date = c("4/12", "4/13", "4/14", "4/15"),
  Winner = c("Tom", "Abe", "George", "Tom")
)
# Players to build indicator columns for. A factor contributes its levels;
# a character column (the data.frame() default since R 4.0, where levels()
# would be NULL and no columns would be produced) contributes its sorted
# unique values.
players <- if (is.factor(df$Winner)) {
  levels(df$Winner)
} else {
  sort(unique(as.character(df$Winner)))
}
# One two-column matrix (<player>_win, <player>_lose) per player, bound
# side by side into a single matrix.
df2 <- do.call(
  cbind,
  lapply(players, function(player) {
    won <- df$Winner == player
    dat <- cbind(as.numeric(won), as.numeric(!won))
    colnames(dat) <- paste0(player, c("_win", "_lose"))
    dat
  })
)
cbind(df, df2)
> cbind(df, df2)
Date Winner Abe_win Abe_lose George_win George_lose Tom_win Tom_lose
1 4/12 Tom 0 1 0 1 1 0
2 4/13 Abe 1 0 0 1 0 1
3 4/14 George 0 1 1 0 0 1
4 4/15 Tom 0 1 0 1 1 0