Here is a sample of my dataset:
library(tidyr)
library(dplyr)
resource <- rep(c("good", "good", "bad", "bad"), times = 4)
fertilizer <- rep(c("none", "nitrogen"), times = 8)
set.seed(1)  # make the sampled values reproducible
t1 <- sample(1:20, 16)
t2 <- sample(1:20, 16)
t3 <- sample(1:20, 16)
t4 <- sample(1:20, 16)
t5 <- sample(1:20, 16)
t6 <- sample(10:100, 16)
t7 <- sample(10:100, 16)
t8 <- sample(10:100, 16)
t9 <- sample(10:100, 16)
t10 <- sample(10:100, 16)
replicates <- 1:16
data <- data.frame(resource, fertilizer, replicates,
                   t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)
data$resource <- as.factor(data$resource)
data$fertilizer <- as.factor(data$fertilizer)
Here t1, t2, ..., t10 are time points. I need to average adjacent time points (non-overlapping), e.g. (t1, t2), (t3, t4), ..., and the new column headings need to be the average of the two times, so that the columns read t1.5, t3.5, etc.
Thus in the end I need only five such columns: t1.5, t3.5, t5.5, t7.5, t9.5.
Is there any way this can be achieved using a dplyr function, or any other function in R?
Edited for OP's modified request:
If you put everything in a tidy format, you can take advantage of the lag/lead functions to average adjacent rows.
library(stringr)
library(forcats)
data %>%
  # long format: one row per replicate x time point
  gather(key = time, value = value, -replicates, -resource, -fertilizer) %>%
  mutate(index = as.integer(str_extract(time, "[0-9]+"))) %>%
  arrange(replicates, index) %>%
  group_by(resource, fertilizer, replicates) %>%
  # average each time point with the one that follows it
  mutate(mid_value = (value + lead(value)) / 2,
         mid_index = (index + lead(index)) / 2,
         mid_time = str_c("t", mid_index)) %>%
  ungroup() %>%
  # keep only the odd starting points: (t1,t2), (t3,t4), ...
  filter(!is.na(mid_value), index %% 2 == 1) %>%
  select(replicates, resource, fertilizer, matches("mid")) %>%
  rename(value = mid_value, time = mid_time, index = mid_index) %>%
  arrange(index) %>%
  mutate(time = as_factor(time)) %>%
  select(-index) %>%
  spread(key = time, value = value) %>%
  arrange(replicates)
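Note that gather() and spread() have since been superseded; if you are on tidyr >= 1.0, the two reshaping steps can be written with their replacements, pivot_longer() and pivot_wider():
# gather() step, rewritten:
pivot_longer(data, cols = -c(resource, fertilizer, replicates),
             names_to = "time", values_to = "value")
# spread() step, rewritten:
pivot_wider(names_from = time, values_from = value)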
Solution using only base R (with df standing in for your data frame): you first need to find the columns you want to average. You can do this by searching the column names for the t + "some number" pattern. After that, create a list of index pairs corresponding to the column numbers of df you want to calculate the mean for.
relevant_cols <- grep("^t[0-9]{1,2}$", names(df))
start <- min(relevant_cols)
end <- max(relevant_cols)
cols <- split(start:end, rep(1:5, each=2))
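Concretely, with the ten time columns sitting in positions 4:13 of the data frame built above, cols comes out as:
cols
# $`1`
# [1] 4 5
#
# $`2`
# [1] 6 7
#
# ...and so on, up to
#
# $`5`
# [1] 12 13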
As shown, cols is a list of five, each element holding a pair of column positions you want to average. This smells like a use-case for sapply():
newdf <- sapply(cols, function(x) rowMeans(df[x]))
colnames(newdf) <- paste0("t", seq(1, diff(range(relevant_cols)), 2) + 0.5)
Edit: I seem to have misunderstood what you want to keep and what not. You can just cbind() (parts of) the old df to newdf:
cbind(df, newdf)
cbind(df[, -relevant_cols], newdf) # This is what you want. I think..
Here ya go:
transmute(data,
          t1.5 = (t1 + t2) / 2,
          t3.5 = (t3 + t4) / 2,
          t5.5 = (t5 + t6) / 2,
          t7.5 = (t7 + t8) / 2,
          t9.5 = (t9 + t10) / 2)
Related
I have a dataframe of coefficients and a dataframe of parameters. Example:
coefficients <- data.frame(a = c(1, 2, 3),
                           b_w = c(3, 4, 5),
                           b_x = c(5, 6, 7))
parameters <- data.frame(w = c(0, 1),
                         x = c(2, 3))
I want to generate a data frame by multiplying these two data frames over all combinations of coefficient and parameter rows, in which each output column is numbered based on the relevant row number in the parameters data frame. Example:
output <- data.frame(
  params1 = c(coefficients$a[1] + coefficients$b_w[1]*parameters$w[1] + coefficients$b_x[1]*parameters$x[1],
              coefficients$a[2] + coefficients$b_w[2]*parameters$w[1] + coefficients$b_x[2]*parameters$x[1],
              coefficients$a[3] + coefficients$b_w[3]*parameters$w[1] + coefficients$b_x[3]*parameters$x[1]),
  params2 = c(coefficients$a[1] + coefficients$b_w[1]*parameters$w[2] + coefficients$b_x[1]*parameters$x[2],
              coefficients$a[2] + coefficients$b_w[2]*parameters$w[2] + coefficients$b_x[2]*parameters$x[2],
              coefficients$a[3] + coefficients$b_w[3]*parameters$w[2] + coefficients$b_x[3]*parameters$x[2])
)
It seems to me that this must be possible using purrr, but I cannot figure out how to get started.
You can use matrix multiplication here:
coefs <- as.matrix(coefficients)
params <- as.matrix(parameters)
# prepend a column of 1s so the intercept column a is picked up by the product
out <- coefs %*% t(cbind(1, params))
colnames(out) <- paste0("params", 1:2)
out
# params1 params2
#[1,] 11 19
#[2,] 14 24
#[3,] 17 29
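If parameters grows beyond two rows, the hard-coded 1:2 generalizes to:
colnames(out) <- paste0("params", seq_len(ncol(out)))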
@markus provides an elegant matrix-based solution, but (as per the question title) I am looking for a purrr-based solution. I still haven't found one, but I do have a solution that remains within the tidyverse:
library(tidyverse)
parameters %>%
  mutate(params = row_number()) %>%
  crossing(coefficients) %>%
  mutate(output = a + b_w*w + b_x*x) %>%
  select(params, output) %>%
  pivot_wider(names_from = params,
              names_prefix = "params",
              values_from = output,
              values_fn = list) %>%
  unnest(cols = everything())
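That said, a purrr sketch along the following lines appears to work on the example above (it hard-codes the column names a, b_w, and b_x, so treat it as an illustration rather than a general solution):
library(purrr)
library(tibble)

parameters %>%
  transpose() %>%                                             # one list element per parameter row
  map(~ with(coefficients, a + b_w * .x$w + b_x * .x$x)) %>%  # evaluate all coefficient rows at once
  set_names(paste0("params", seq_along(.))) %>%
  as_tibble()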
I have a dataframe with two columns (one contains a timestamp and the other a specific value).
The gaps between the timestamps are not equispaced but are approximately the same length (1000 ms +/- 15%).
Every now and then a timestamp + value pair is missing, resulting in a timestamp difference of approximately twice the previous difference.
Is there a way to find those gaps and just add an NA row, to later fill it with imputed values?
Thanks!
How about this:
library(tidyverse)

df <- tibble(id = as.character(1:20),
             t = seq(1000, 20000, by = 1000) + runif(20, -15, 15),
             x = rnorm(20)) %>%
  filter(!(id %in% c(3, 7, 9)))
df

df <- df %>%
  mutate(delta_t = t - lag(t),
         mean_t = (t + lag(t)) / 2) %>%
  filter(delta_t > 1500) %>%  # ~2x the normal spacing signals a missing row
  mutate(t = mean_t, x = NA) %>%
  select(id, t, x) %>%
  bind_rows(df) %>%
  arrange(t)
df
The id column is there purely so that you can easily see that I've taken out three values when constructing the dummy data.
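For the later imputation step (not part of the question, so just one option): zoo::na.approx() will linearly interpolate the NA values against the timestamps.
library(zoo)

df <- df %>%
  mutate(x = na.approx(object = x, x = t, na.rm = FALSE))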
I want to use dplyr to calculate a field using its previous value. A for loop would do the job, but I want to calculate it by different groups of st. I understand mutate (with lag or ave) can't use an uninitialized field.
df <- data.frame(st = rep(c('a', 'b', 'c'), each = 5),
                 z = rep(c(10, 14, 12), each = 5),
                 day = rep(1:5, 3),
                 GAI = rep(0:4, 3),
                 surfT = sample(1:15))
df %>%
  group_by(st) %>%
  mutate(soilT = lag(soilT, order_by = day) +
           (surfT - lag(soilT, order_by = day)) * 0.24 * exp(-z * 0.017) * exp(-0.15 * GAI))
or
df %>%
  group_by(st) %>%
  mutate(soilT = ave(soilT, c(st), FUN = function(x) c(0, soilT + (surfT - soilT))) *
           0.24 * exp(-z * 0.017) * exp(-0.15 * GAI))
How can this simple for loop be calculated in dplyr by group of st?
df$soilT <- 0
for (i in 2:nrow(df)) {
  df$soilT[i] <- df$soilT[i-1] + (df$surfT[i] - df$soilT[i-1]) *
    0.24 * exp(-df$z[i] * 0.017) * exp(-0.15 * df$GAI[i])
}
We can use accumulate from purrr to feed the output of the previous row into the calculation for the current row.
library(dplyr)

result <- df %>%
  group_by(st) %>%
  mutate(soilT = purrr::accumulate(2:n(),
                                   ~ .x + (surfT[.y] - .x) * 0.24 * exp(-z[.y] * 0.017) * exp(-0.15 * GAI[.y]),
                                   .init = 0))
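For comparison, the same per-group recursion can be written in base R with Reduce(accumulate = TRUE); this sketch assumes the rows are already ordered by st and day, as in the example data:
df$soilT <- unlist(lapply(split(df, df$st), function(d) {
  Reduce(function(prev, i) {
    prev + (d$surfT[i] - prev) * 0.24 * exp(-d$z[i] * 0.017) * exp(-0.15 * d$GAI[i])
  }, 2:nrow(d), init = 0, accumulate = TRUE)
}))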
I have a large data set with X and Y points. I want to animate it, so I want to remove the points that are fixed locations. I would like to remove rows where the same X and Y is repeated more than n times.
So far I have this; is there a more elegant solution? Thanks!
uniques <- unique(data[c("Lat", "Long")])
uniques$values <- row.names(uniques)
uniques2 <- inner_join(data, uniques, by = c("Lat", "Long"))
reps <- data.frame(unclass(rle(uniques2$values)))
delete <- as.character(reps$values[reps$lengths > 10])
data2 <- uniques2[!uniques2$values %in% delete, ]
A tidyverse way would be (note that n() counts all occurrences of a Lat/Long pair, not just consecutive runs):
data2 <- data %>%
  group_by(Long, Lat) %>%
  filter(n() <= 10) %>%
  ungroup()
Assuming you want to keep the first of the rows where x and y start repeating, you could try the following approach with the dplyr package:
library(dplyr)

# Example data
df <- data.frame(
  x = c(rep(1, 5), 2:6, rep(7, 5)),
  y = c(rep(9, 5), 2:6, rep(8, 5))
)

# Cut-off value
n <- 3

# Remove unwanted rows: where a run of identical (x, y) rows has more than
# n repeats, drop the repeats and keep only the first row of the run
new_df <- df %>%
  mutate(same_as_prev = coalesce(x == lag(x) & y == lag(y), FALSE)) %>%
  group_by(x, y, same_as_prev) %>%
  mutate(consec_count = n()) %>%
  ungroup() %>%
  filter(!(same_as_prev & consec_count > n)) %>%
  select(-same_as_prev, -consec_count)
Using data.table, I would try this one-line solution:
library(data.table)
data <- as.data.table(data)[, count := .N, by = .(Lat, Long)][count <= n][, count := NULL]
Best!
I'm Brazilian, sorry about my English!
I would like to know if there is a function implemented in some R package to keep the first n rows and group the remaining ones into an "Others" row, summarising the columns.
Below is an example of what I want:
library(tidyverse)
library(plotly)
library(scales)
data("lakers")
x <- bind_rows(
  lakers %>% count(player) %>% arrange(-n) %>% head(10),
  lakers %>% count(player) %>% arrange(-n) %>% slice(11:n()) %>%
    summarise(player = "Others", n = sum(n))) %>%
  filter(!player == "") %>%
  mutate(player = factor(player, levels = rev(.$player)))
ggplot(x, aes(x = player, y = n)) +
  geom_col(fill = "DodgerBlue1", col = "DodgerBlue3") +
  coord_flip() +
  geom_text(aes(y = n, label = comma(n)), hjust = -.2) +
  scale_y_continuous(limits = c(0, max(x$n * 1.1))) +
  theme_minimal()
I need to create a ggplot like this, but the underlying query uses dplyr and is big, so I don't want to repeat it every time.
I would like some function like:
head.other(x, rows = 20, fun = sum, name = "Others")
Here is a function that I think will give you what you need:
library(tibble)
library(dplyr)

df <- data.frame(col1 = rnorm(10), col2 = rnorm(10)) # your data frame
n <- 6                                               # top n rows to keep

myfun <- function(df, n) {
  # separate the rows to keep from those to aggregate
  preserve.df <- df[1:n, ]
  summarise.df <- df[(n+1):nrow(df), ]
  # create a new data frame in the required format
  new.df <- bind_rows(preserve.df, sapply(summarise.df, sum))
  # add a column to identify the rows and return
  rownames(new.df) <- c(paste0("r", 1:n), "Other")
  rownames_to_column(new.df)
}

myfun(df, 6)
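Alternatively, since you are already in the tidyverse: forcats::fct_lump_n() is built for exactly this top-n-plus-"Others" lumping (a sketch against the lakers data; on forcats older than 0.5 the function is spelled fct_lump()):
library(forcats)
library(dplyr)

x <- lakers %>%
  filter(player != "") %>%
  count(player) %>%
  # keep the 10 most frequent players (weighted by n), lump the rest
  mutate(player = fct_lump_n(player, n = 10, w = n, other_level = "Others")) %>%
  group_by(player) %>%
  summarise(n = sum(n)) %>%
  arrange(desc(n))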