DF <- data.frame(x1=c(NA,7,7,8,NA), x2=c(1,4,NA,NA,4)) # a data frame with NA
WhereAreMissingValues <- which(is.na(DF), arr.ind=TRUE) # find the position of the missing values
Modes <- apply(DF, 2, function(x) {which(tabulate(x) == max(tabulate(x)))}) # find the modes of each column
DF
WhereAreMissingValues
Modes
I would like to replace the NAs of each column of DF with the mode, accordingly.
Please for some help.
Map provides here a one line solution:
data.frame(Map(function(u,v){u[is.na(u)]=v;u},DF, Modes))
# x1 x2
#1 7 1
#2 7 4
#3 7 4
#4 8 4
#5 7 4
Here's how I would do this.
First I'll define an helper function
Myfunc <- function(x) as.numeric(names(sort(-table(x)))[1L])
Then just use lapply over the data set
DF[] <- lapply(DF, function(x){x[is.na(x)] <- Myfunc(x) ; x})
DF
# x1 x2
# 1 7 1
# 2 7 4
# 3 7 4
# 4 8 4
# 5 7 4
Related
I want to save residuals from a linear model to a dataframe. I was trying to do it with the line of code (note that this was supposed to go inside a loop):
resi <- NULL
resi <- cbind(resi, colnames(dados[1])=residuals(m))
Here I intended to save the residuals vector from my model m under the same column name from the dados object (which is basicaly a date), but I get the error:
Error: unexpected '=' in "resi <- cbind(resi, colnames(dados[1])="
You want `colnames <- ()`.
cbind(d, `colnames<-`(d, letters[1:4]))
# X1 X2 X3 X4 a b c d
# 1 1 4 7 10 1 4 7 10
# 2 2 5 8 11 2 5 8 11
# 3 3 6 9 12 3 6 9 12
It's similar to setNames() but also compatible with matrices.
Toydata
d <- data.frame(matrix(1:12, 3, 4))
It is possible to do this in tibble
library(tibble)
tibble(resi, !!colnames(dados)[1] :=residuals(m))
The idea is extracting the position of df charactes with a reference of other df, example:
L<-LETTERS[1:25]
A<-c(1:25)
df<-data.frame(L,A)
Compare<-c(LETTERS[sample(1:25, 25)])
df[] <- lapply(df, as.character)
for (i in 1:nrow(df)){
df[i,1]<-which(df[i,1]==Compare)
}
head(df)
L A
1 14 1
2 12 2
3 2 3
This works good but scale very bad, like all for, any ideas with apply, or dplyr?
Thanks
Just use match
Your data (use set.seed when providing data using sample)
df <- data.frame(L = LETTERS[1:25], A = 1:25)
set.seed(1)
Compare <- LETTERS[sample(1:25, 25)]
Solution
df$L <- match(df$L, Compare)
head(df)
# L A
# 1 10 1
# 2 23 2
# 3 12 3
# 4 11 4
# 5 5 5
# 6 21 6
I have a regular expression that parses a bunch of text, an when doing regmatches(myText,myRegex) it returns a list which looks like:
[[1]]
[1] "a=1" "b=3" "a=9" "c=2" "b=4"
...
I'd like to build a data.frame or table - whatever suits best - to finally have something like:
a b c
1 3 2
9 4 ...
Is it possible to make this in a simple fashion? What are your suggestions?
Thanks in advance.
Its not entirely clear what the general case is here but this works on the data provided.
Assuming this input:
x <- c("a=1", "b=3", "a=9", "c=2", "b=4")
split the values by the names producing s and massage into a data.frame:
s <- split(as.numeric(sub(".*=", "", x)), sub("=.*", "", x))
as.data.frame(do.call(cbind, lapply(s, ts)))
giving:
a b c
1 1 3 2
2 9 4 NA
No packages needed.
You can either use base R methods
d1 <- read.table(text=gsub("[[:punct:]]", " " , unlist(lst)))
d2 <- transform(d1, indx=ave(seq_along(V1), V1, FUN=seq_along))
res <- reshape(d2, timevar='V1', idvar='indx', direction='wide')[,-1]
colnames(res) <- gsub(".*\\.", "", colnames(res))
res
# a b c
#1 1 3 2
#3 9 4 2
#6 4 5 NA
#9 9 NA NA
Or using dcast from reshape2 on d2
library(reshape2)
dcast(d2,indx~V1, value.var='V2')[,-1]
# a b c
#1 1 3 2
#2 9 4 2
#3 4 5 NA
#4 9 NA NA
data
lst <- list(c('a=1', 'b=3', 'a=9', 'c=2', 'b=4'),
c('a=4', 'c=2', 'b=5', 'a=9'))
Using rex may make this type of extraction task a little simpler.
x <- c("a=1", "b=3", "a=9", "c=2", "b=4", "a=2")
First extract the names and values from the strings.
library(rex)
matches <- re_matches(x,
rex(
capture(name="name", letter),
"=",
capture(name="value", digit)
))
#> name value
#>1 a 1
#>2 b 3
#>3 a 9
#>4 c 2
#>5 b 4
#>6 a 2
Then tally the groups using split().
groups <- split(as.numeric(matches$value), matches$name)
#>$a
#>[1] 1 9 2
#>
#>$b
#>[1] 3 4
#>
#>$c
#>[1] 2
If we try to convert directly to a data.frame from split() the groups with fewer members will have their members recycled rather than NA, so instead explicitly fill with NA.
largest_group <- max(sapply(groups, length))
#>[1] 3
groups <- lapply(groups, function(group) {
if (length(group) < largest_group) {
group[largest_group] <- NA
}
group
})
#>$a
#>[1] 1 9 2
#>
#>$b
#>[1] 3 4 NA
#>
#>$c
#>[1] 2 NA NA
Finally we can create the data.frame
do.call('data.frame', groups)
#> a b c
#>1 1 3 2
#>2 9 4 NA
#>3 2 NA NA
Here's an approach using tools from my "splitstackshape" package:
library(splitstackshape)
dcast.data.table( ## Makes the long data wide
getanID( ## Adds an ID variable for dcast
## create a single column data.table and split it by the "="
cSplit(as.data.table(unlist(lst)), "V1", "="), "V1_1"),
.id ~ V1_1, value.var = "V1_2")
# .id a b c
# 1: 1 1 3 2
# 2: 2 9 4 2
# 3: 3 4 5 NA
# 4: 4 9 NA NA
This uses #akrun's sample data:
lst <- list(c('a=1', 'b=3', 'a=9', 'c=2', 'b=4'),
c('a=4', 'c=2', 'b=5', 'a=9'))
I have the following data frame and I would like to create a new one that will be like the one below.
ID1 ID2 ID3 ID4
x1_X 0 10 4 7
x2_X 2 12 5 8
x3_X 3 1 3 5
y1_Y 4 13 6 4
y2_Y 5 14 1 9
y3_Y 2 11 1 5
y4_Y 1 1 2 3
z1_Z 1 0 0 5
z2_Z 3 6 7 7
New data frame
ID1 ID2 ID3 ID4
X x3 x2 x2 x2
Y y2 y2 y1 y2
Z z2 z2 z2 z2
Basically the idea is the following:
For each ID I want to find which of the rownames (x1_X,x2_X,x3_X) has the most extreme value and assign this to name X since in the rownames I have subgroups.
My data frame is huge: 1700 columns and 100000 rows.
First we need to split the group and subgroup labels:
grp <- strsplit(row.names(df), "_")
And if performance is an issue, I think data.table is our best choice:
library(data.table)
df$group <- sapply(grp, "[", 2)
subgroup <- sapply(grp, "[", 1)
dt <- data.table(df)
And we now have access to the single line:
result <- dt[,lapply(.SD, function(x) subgroup[.I[which.max(x)]]), by=group]
Which splits the data.table by the character after the underscore (by=group) and then, for every column of the rectangular subset (.SD) we get the index in the sub-rectangle (which.max), and then map it back to the whole data.table (.I), and then extract the relevant subgroup (subgroup).
The data.table package is meant to be quite efficient, though you might want to look into indexing your data.table if you're going to be querying it multiple times.
Your table:
df <- read.table (text= " ID1 ID2 ID3 ID4
x1_X 0 10 4 7
x2_X 2 12 5 8
x3_X 3 1 3 5
y1_Y 4 13 6 4
y2_Y 5 14 1 9
y3_Y 2 11 1 5
y4_Y 1 1 2 3
z1_Z 1 0 0 5
z2_Z 3 6 7 7", header = T)
Split rownames to get groups:
library(plyr)
df_names <- ldply(strsplit (rownames(df), "_"))
colnames(df_names) <- c ("group1", "group2")
df2 <- cbind (df, df_names)
Create new table:
df_new <- data.frame (matrix(nrow = length(unique (df2$group2)),
ncol = ncol(df)))
colnames(df_new) <- colnames(df)
rownames (df_new) <- unique (df_names[["group2"]])
Filling new table with a loop:
for (i in 1:ncol (df_new)) {
for (k in 1:nrow (df_new)) {
col0 <- colnames (df_new)[i]
row0 <- rownames (df_new)[k]
sub0 <- df2 [df2$group2 == row0, c(col0, "group1")]
df_new [k,i] <- sub0 [sub0[1]==max (sub0[1]), 2]
}
}
I have need to name columns of a data.frame with duplicate names. inside of data.frame you can use check.names = FALSE to do the naughty name deed. But if you index this then you lose the naughty names when indexing. I want to retain those names. So beloe is an example and the output I get and I'd like to get:
x <- data.frame(b= 4:6, a =6:8, a =6:8, check.names = FALSE)
x[, -1]
I get:
a a.1
1 6 6
2 7 7
3 8 8
I'd like:
a a
1 6 6
2 7 7
3 8 8
How about this:
subdf <- function(df, ii) {
do.call("data.frame", c(as.list(df)[ii], check.names=FALSE))
}
subdf(x, -1)
# a a
# 1 6 6
# 2 7 7
# 3 8 8
subdf(x, 2:3)
# a a
# 1 6 6
# 2 7 7
# 3 8 8
Here's an ugly solution
> tmp <- data.frame(b=4:6, a=6:8, a=6:8, check.names=FALSE)
> setNames(tmp[, -1], names(tmp)[-1])
a a
1 6 6
2 7 7
3 8 8
Looking at the code for [.data.frame gives this as part of the code
if (anyDuplicated(cols))
names(y) <- make.unique(cols)
and I couldn't see anything in the code that would allow one to skip that check. So it looks like we'll just have to write our own function. It's not very safe though and I'm sure a much better version could be created...
dropCols <- function(x, cols){
nm <- colnames(x)
x <- x[, -cols]
colnames(x) <- nm[-cols]
x
}
x <- data.frame(b= 4:6, a =6:8, a =6:8, check.names = FALSE)
#x[, -1]
dropCols(x, 1)
# a a
#1 6 6
#2 7 7
#3 8 8
per dirks tongue in cheek comment:
safe.data.frame <- function(dat, index) {
colnam <-colnames(dat)[index]
dat2 <- dat[, index]
colnames(dat2) <- colnam
dat2
}
safe.data.frame(x, -1)
I was hoping for something better :)