I have a dataframe:
a <- c(1:5)
b <- c(5:9)
c <- c(7:11)
min <- c(2,7,4,5,3)
max <- c(5,9,12,8,7)
df1 <- data.frame(a,b,c,min,max)
df1
How can I set row limits, so the minimum and maximum values of each row a,b,c is set by the row values in columns: min and max?
Thank you for your help.
I find this easier if you think of the data frame as a list of columns:
mins <- lapply(df1[1:3], function(x) pmax(x, df1$min))
maxs <- lapply(mins, function(x) pmin(x, df1$max))
Then you can rebuild the data.frame:
df2 <- do.call(cbind, maxs)
f <- function(X){
X[X < df1$min] <- df1$min[X < df1$min]
X[X > df1$max] <- df1$max[X > df1$max]
X
}
sapply(df1[,1:3] , f)
Try something like:
## Function that checks the limits of each column
limit <- function(x, mn, mx) {
pmax(pmin(x, mx), mn)
}
## Then build your data.frame
df2 <- data.frame(a = limit(a, min, max), b = limit(b, min, max), c = limit(c, min, max), min, max)
Using apply is another option on an existing data.frame
df3 <- apply(df1[ ,1:3], 2, limit, mn = min, mx = max)
I hope it helps,
Alex
Pretty much this: EDIT I apologize: it should be "pmin" for "parallel minimum" and "pmax"
for(j in 1:3 ) df1[,j]<-pmin(df1[,j],df1[,5])
for(j in 1:3 ) df1[,j<-pmax(df1[,j],df1[,4])
PS you're probably better off with a matrix
Related
I have a data frame of values and want to count the number of values in each column that are greater than or equal to a series of thresholds. The ultimate goal is to produce a graph that looks like panels B and D in this figure
I have figured out a way to do this with a for loop but would like to avoid a loop if possible:
data <- as.data.frame(attitude)
max <- 100
counts <- data.frame(t(c(score = 1, colSums(data >= 1))))
for (x in 2:max) {
counts <- rbind(counts, c(score = x, colSums(data >= x)))
}
I tried adapting the code from this question (Count the number of values per column above a range of thresholds in R), but it gives an error and does not produce the expected result:
as.data.frame() %>%
lapply(
function(z) table(findInterval(z, 0:max, rightmost.closed = TRUE))
) %>%
do.call(cbind, .) %>%
as.data.frame()
Is there a way to do this without a loop? Thanks.
You can do this with sapply/lapply :
data <- as.data.frame(attitude)
num <- seq_len(100)
result <- data.frame(score = num,
do.call(rbind, lapply(num, function(x) colSums(data >= x))))
How can I remove (for example) 3 rows every 10 rows?
For example if I have a dataframe with 100 rows at the end I need I dataframe with 70 rows (with missing the first,the second,the third,the eleventh,the twelfth,the thirteenth and so on).
Using a toy dataframe with 100 lines, try this:
df <- data.frame(x = 1:100, y = 1:100)
rem <- as.vector(sapply(1:3, function(i) seq(i, nrow(df), 10)))
df[-rem, ]
We can use outer to create a sequence of the rows to remove.
result <- df[-c(outer(seq(1, nrow(df), 10), 0:2, `+`)), ]
We could do this with rep in a vectorized way
i1 <- seq(1, nrow(df), 10)
out <- df[-(rep(0:2, each = length(i1)) + i1),]
data
df <- data.frame(x = 1:100, y = 1:100)
I want to get the total number of NA that missmatch between two dataframes.
I have found the way to get this for two vectors as follows:
compareNA <- function(v1,v2) {
same <- (v1 == v2) | (is.na(v1) & is.na(v2))
same[is.na(same)] <- FALSE
n <- 0
for (i in 1:length(same))
if (same[i] == "FALSE"){
n <- n+1
}
return(n)
}
Lets say I have vector aand bwhen comparing them I got as a result 2
a <- c(1,2,NA, 4,5,6,NA,8)
b <- c(NA,2,NA, 4,NA,6,NA,8)
h <- compareNA(a,b)
h
[1] 2
My question is: how to apply this function for dataframes instead of vectors?
Having as an example this datafames:
a2 <- c(1,2,NA,NA,NA,6,NA,8)
b2 <- c(1,NA,NA,4,NA,6,NA,NA)
df1 <- data.frame(a,b)
df2 <- data.frame(a2,b2)
what i expect as a result is 5, since this are the total number of NAs that appear in df2 that are not in df1. Any suggestion how to make this work?
Here's a second thought.
xy1 <- data.frame(a = c(NA, 2, 3), b = rnorm(3))
xy2 <- data.frame(a = c(NA, 2, 4), b = rnorm(3))
com <- intersect(colnames(xy1), colnames(xy2))
sum(xy1[, com] == xy2[, com], na.rm = TRUE)
If you don't want to worry about column names (but you should), you can make sure the columns align perfectly. In that case, intersect step is redundant.
sum(xy1 == xy2, na.rm = TRUE)
A third way (assuming dimensions of df1 & df2 are same):
sum(sapply(1:ncol(df1), function(x) compareNA(df1[,x], df2[,x])))
# 5
It would be easier to force both dataframes to have the same column names and compare column by column when those have the same name. You can then simply use a loop over columns and increment a running total by applying your function.
compareNA.df <- function(df1, df2) {
total <- 0
common_columns <- intersect(colnames(df1), colnames(df2))
for (col in common_columns) {
total <- total + compareNA(df1[[col]], df2[[col]])
}
return(total)
}
colnames(df2) <- c("a", "b")
compareNA.df(df1, df2)
I have a dataframe df. For each column I want to add another column indicating whether the value is inside or outside my simple "outlier detection thresholds" by writing TRUE (= outlier) or FALSE (= no outlier).
Here's the code:
df <- read.csv("<FILE>", header=TRUE, sep=";")
column_names <- colnames(df[,-1]) # first column is actually row name
for(name in column_names) {
med <- median(df[[name]], na.rm = TRUE)
std <- sd(df[[name]], na.rm = TRUE)
max <- med + 3 * std
min <- med - 3 * std
newcol <- paste(name, "outlier", sep="_") # create new column name
df <- within(df, newcol <- ifelse(name < max & name > min,"FALSE","TRUE"))
}
Instead of adding a new column for every existing one, just one column named "newcol" is added. How do I access the actual value of the variable newcol in this case? Alread tried get(newcol) and [[newcol]].
Thank you so much for your help!
EDIT:
Solution looks like this
df <- read.csv("<FILE>", header=TRUE, sep=";")
column_names <- colnames(df[,-1]) # first column is actually row name
for(name in column_names) {
med <- median(df[[name]], na.rm = TRUE)
std <- sd(df[[name]], na.rm = TRUE)
max <- med + 3 * std
min <- med - 3 * std
newcol <- paste(name, "outlier", sep="_")
df[[newcol]] <- with(df, ifelse(df[[name]] < max & df[[name]] > min,"FALSE","TRUE"))
}
Your last line should read:
df[[newcol]] <- with(df, ifelse(...))
The <- operator assumes that newcol is the actual name of the column, not a variable that contains this name.
This is an approach using data.table
require(data.table)
outlier <- function(x) {
med <- median(x, na.rm = TRUE)
std <- sd(x, na.rm = TRUE)
max <- med + 3 * std
min <- med - 3 * std
return(!(x < max & x > min))
}
# df <- fread("<FILE>")
df <- data.table(x = rt(10, 5), y = rt(10, 5))
df[3, x := 100]
df[7, y := 100]
df[, paste(names(df), "outlier", sep="_") := lapply(.SD, outlier)]
df
You could assign everything at once:
is_outlier <- function(x) {
med <- median(x, na.rm = TRUE)
std <- sd(x, na.rm = TRUE)
max <- med + 3 * std
min <- med - 3 * std
!(x < max & x > min)
}
column_names <- names(df)[-1]
column_names_outlier <- paste(column_names, "outlier", sep="_")
df[column_names_outlier] <- lapply(df[column_names], is_outlier)
I am trying to calculate the average of each column in a dataframe using the top k values. I have a solution, however, it is slow and hamfisted. Here is what I came up with:
predictMat <- matrix(0,nrow = length(colnames(DT)),ncol = 1)
k <- 100
itemSummary <- for(i in colnames(DT)) {
u <- data.frame(DT[,i , drop = F])
sortU1 <- data.frame(u[order(u[,i], decreasing = T),, drop = F])
u1Neighbors <- data.matrix(sortU1[1:k,, drop = F])
predictMat[i] <- mean(u1Neighbors, na.rm = T)
}
You can do this in one line using the apply function:
# Sample data frame
set.seed(144)
DT <- matrix(rnorm(1000), nrow=100)
k <- 10
# Compute average of 10 largest values in each column
apply(DT, 2, function(x) mean(tail(sort(x), k)))
# [1] 1.721765 1.658917 1.630231 1.558280 1.606363 1.526322 1.810814 1.678135
# [9] 1.541305 1.621984
could do this with back-to-back apply functions
set.seed(100)
x <- as.data.frame ( matrix(runif(5000,0,10), nrow=1000,ncol=5) )
x1<- apply(x,2,sort,decreasing=T)
apply(x1[1:100,],2,mean)
V1 V2 V3 V4 V5
9.548000 9.572912 9.422325 9.547370 9.462894
edit: looks like I was a few seconds behind in my answer!