df <- data.frame('Dev' = 1:12,
                 'GWP' = seq(10, 120, 10),
                 '2012' = 1:12,
                 'Inc' = seq(10, 120, 10),
                 'GWP2' = c(seq(10, 100, 10), NA, NA),
                 '2013' = 1:12,
                 'Inc2' = c(seq(10, 100, 10), NA, NA),
                 'GWP3' = c(seq(10, 80, 10), NA, NA, NA, NA),
                 '2014' = 1:12,
                 'Inc3' = c(seq(10, 80, 10), NA, NA, NA, NA))
head(df)
result_df <- data.frame('Dev' = rep(1:12, 3),
                        'GWP' = c(seq(10, 120, 10),
                                  c(seq(10, 100, 10), NA, NA),
                                  c(seq(10, 80, 10), NA, NA, NA, NA)),
                        'YEAR' = c(rep(2012, 12),
                                   rep(2013, 12),
                                   rep(2014, 12)),
                        'Inc' = c(seq(10, 120, 10),
                                  c(seq(10, 100, 10), NA, NA),
                                  c(seq(10, 80, 10), NA, NA, NA, NA)))
head(result_df)
The above is my data structure.
I'm trying to make df look like result_df. I'm assuming the reshape2 library would somehow do the trick, but I'm having trouble getting it to come out as expected:
x <- melt(df,id=c("Dev"))
x$value <- ifelse(x$variable == 'X2012',2012,
ifelse(x$variable == 'X2013',2013,
ifelse(x$variable == 'X2014',2014,x$value)))
x$variable <- ifelse(x$variable %in% c('GWP','GWP2','GWP3'),'GWP',
ifelse(x$variable %in% c('Inc','Inc2','Inc3'), 'Inc',
ifelse(x$variable %in% c('X2012','X2013','X2014'),"Year",
x$variable)))
The problem is that the "year" columns in my actual data can span 20-30 years, and I want to avoid writing multiple ifelse statements to map them. Is there a way to do this?
The data needs some pre-processing to get the expected output. Using the tidyverse, one possible way is:
library(tidyverse)
df %>%
  gather(key, value, -Dev) %>%
  mutate(col = case_when(str_detect(key, "^GWP") ~ "GWP",
                         str_detect(key, "^X") ~ "Year",
                         str_detect(key, "^Inc") ~ "Inc"),
         value = ifelse(col == "Year", sub("^X", "", key), value)) %>%
  select(-key) %>%
  group_by(col) %>%
  mutate(Dev1 = row_number()) %>%
  spread(col, value) %>%
  select(-Dev1)
# A tibble: 36 x 4
# Dev GWP Inc Year
# <int> <chr> <chr> <chr>
# 1 1 10 10 2012
# 2 1 10 10 2013
# 3 1 10 10 2014
# 4 2 20 20 2012
# 5 2 20 20 2013
# 6 2 20 20 2014
# 7 3 30 30 2012
# 8 3 30 30 2013
# 9 3 30 30 2014
#10 4 40 40 2012
# … with 26 more rows
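For what it's worth, gather() and spread() have since been superseded in tidyr. A rough equivalent using pivot_longer()/pivot_wider() (a sketch following the same column-detection logic as above, not part of the original answer) might look like:
df %>%
  pivot_longer(-Dev, names_to = "key", values_to = "value") %>%
  mutate(col = case_when(str_detect(key, "^GWP") ~ "GWP",
                         str_detect(key, "^X") ~ "Year",
                         str_detect(key, "^Inc") ~ "Inc"),
         value = ifelse(col == "Year", as.numeric(sub("^X", "", key)), value)) %>%
  group_by(col) %>%
  mutate(id = row_number()) %>%   # row id within each output column
  ungroup() %>%
  pivot_wider(id_cols = c(Dev, id), names_from = col, values_from = value) %>%
  select(-id)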
I found that this works for the first part:
apply(matrix(c(2012:2014)), 1, function(y) x$value[x$variable == paste("X", y, sep = "")] <<- y )
Create a one-dimensional matrix of the years to iterate over with apply.
Create a function that replaces the values found through masking.
Note the use of <<-: it assigns the respective values to the x scoped one level above the function defined inside the apply.
Note that this applies the function over the years in x and returns the values used in the replacement.
For the second part:
x$variable[x$variable %in% c('GWP', 'GWP2', 'GWP3')] <- "GWP"
x$variable[x$variable %in% c('Inc', 'Inc2', 'Inc3')] <- "Inc"
Since the variable column is type factor and Year is not a level:
x <- transform(x, variable = as.character(variable))
x$variable[x$variable %in% c('X2012', 'X2013', 'X2014')] <- "Year"
x <- transform(x, variable = as.factor(variable))
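As a side note (not from the original answers): if the year columns always look like X<year> after import, a regex-based recode avoids writing one statement per year, which also covers the 20-30 year case mentioned in the question. A rough base-R sketch:
library(reshape2)
x <- melt(df, id = c("Dev"))
x$variable <- as.character(x$variable)
# any variable matching "X" followed by digits is treated as a year column
is_year <- grepl("^X[0-9]+$", x$variable)
x$value[is_year] <- as.numeric(sub("^X", "", x$variable[is_year]))
x$variable[is_year] <- "Year"
# collapse GWP, GWP2, GWP3, ... and Inc, Inc2, Inc3, ... to their base names
x$variable <- sub("^(GWP|Inc).*", "\\1", x$variable)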
I have a problem that sounds easy, but I could not find a solution in R. I would like to shift values according to the year of release: the first column holds the release year, and the remaining columns are the years in which devices broke (the values are counts of broken devices).
This is a solution in Python:
def f(x):
    shifted = np.argmin((x.index.astype(int) < x.name[0]))
    return x.shift(-shifted)

df = df.set_index(['Delivery Year', 'Freq']).apply(f, axis=1)
df.columns = [f'Year.{i + 1}' for i in range(len(df.columns))]
df = df.reset_index()
df
I would like to have it in R too.
# TEST
data <- data.frame(
  `Delivery Year` = c('1976', '1977', '1978', '1979'),
  `Freq` = c(120, 100, 80, 60),
  `Year.1976` = c(10, NA, NA, NA),
  `Year.1977` = c(5, 3, NA, NA),
  `Year.1978` = c(10, NA, 8, NA),
  `Year.1979` = c(13, 10, 5, 14)
)
data
# DESIRED
data <- data.frame(
  `Delivery Year` = c('1976', '1977', '1978', '1979'),
  `Freq` = c(120, 100, 80, 60),
  `Year.1` = c(10, 3, 8, 14),
  `Year.2` = c(5, NA, 5, NA),
  `Year.3` = c(10, 10, NA, NA),
  `Year.4` = c(13, NA, NA, NA)
)
data
In addition, would it also be possible to transform the number of broken devices into a percentage of the Freq column?
Thank you
Using tidyverse
data %>%
  pivot_longer(!c(Delivery.Year, Freq)) %>%
  separate(name, c("Lab", "Year")) %>%
  select(-Lab) %>%
  mutate_all(as.numeric) %>%
  filter(Year >= Delivery.Year) %>%
  group_by(Delivery.Year, Freq) %>%
  mutate(ind = paste0("Year.", row_number()),
         per = value / Freq) %>%
  ungroup() %>%
  pivot_wider(id_cols = c(Delivery.Year, Freq), names_from = ind, values_from = c(value, per))
I pivoted the data into long form to begin with and separated the original column names Year.1976, Year.1977, etc. so I could keep just the years and drop the "Year" piece. I then converted all columns to numeric to allow operations such as filtering for Year >= Delivery.Year, created a column to get the titles you requested (Year.1, Year.2, etc.), and calculated the percentages. Finally, pivot_wider puts it back in the format you requested. One thing to note: it was unclear whether you wanted both the original values and the percentages or just the percentages. If you only want the percentages, values_from = per should do it for you.
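For illustration, a percent-only variant of the same pipeline (a sketch, not part of the original answer; only the last step changes) would be:
data %>%
  pivot_longer(!c(Delivery.Year, Freq)) %>%
  separate(name, c("Lab", "Year")) %>%
  select(-Lab) %>%
  mutate_all(as.numeric) %>%
  filter(Year >= Delivery.Year) %>%
  group_by(Delivery.Year, Freq) %>%
  mutate(ind = paste0("Year.", row_number()),
         per = value / Freq) %>%
  ungroup() %>%
  pivot_wider(id_cols = c(Delivery.Year, Freq), names_from = ind, values_from = per)  # per only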
library(dplyr)
f <- function(df) {
  years <- paste0("Year.",
                  sort(as.vector(na.omit(as.integer(stringr::str_extract(colnames(df), "\\d+"))))))
  df1 <- df %>% select(years)
  df2 <- df %>% select(-years)
  val <- c()
  firstyear <- years[1]
  for (k in 1:nrow(df1)) {
    vec <- as.numeric(as.vector(df1[k, ]))
    val[k] <- (as.numeric(suppressWarnings(na.omit(vec))))[1]
  }
  df1[firstyear] <- val
  colnames(df1) <- c(paste0("Year.", seq(1:ncol(df1))))
  df <- cbind(df2, df1)
  print(df)
}
> f(data)
Delivery.Year Freq Year.1 Year.2 Year.3 Year.4
1 1976 120 10 5 10 13
2 1977 100 3 3 NA 10
3 1978 80 8 NA 8 5
4 1979 60 14 NA NA 14
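For comparison, a rough base-R translation of the Python f() above (a sketch, not from either answer; it assumes the # TEST `data` frame from the question, with columns Delivery.Year, Freq and Year.1976 ... Year.1979):
year_cols <- grep("^Year\\.", names(data), value = TRUE)
col_years <- as.integer(sub("^Year\\.", "", year_cols))

# shift each row left so its first column corresponds to the delivery year
shifted <- t(apply(data, 1, function(row) {
  vals <- as.numeric(row[year_cols])
  drop <- sum(col_years < as.integer(row["Delivery.Year"]))
  c(vals[(drop + 1):length(vals)], rep(NA, drop))
}))

out <- data.frame(data[c("Delivery.Year", "Freq")],
                  setNames(as.data.frame(shifted), paste0("Year.", seq_along(year_cols))))
out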
I have a table with columns
[Time, var1, var2, var3, var4...varN]
I need to calculate the mean/SE per Time for each of var1, var2, ..., varN, and I want to do this programmatically for all variables rather than one at a time, which would involve a lot of copy-pasting.
Section 8.2.3 of https://tidyeval.tidyverse.org/dplyr.html is close to what I want, but my code below:
x <- as.data.frame(matrix(nrow = 2, ncol = 3))
x[1, 1] = 1
x[1, 2] = 2
x[1, 3] = 3
x[2, 1] = 4
x[2, 2] = 5
x[2, 3] = 6
names(x)[1] <- "time"
names(x)[2] <- "var1"
names(x)[3] <- "var2"
grouped_mean3 <- function(.data, ...) {
  print(.data)
  summary_vars <- enquos(...)
  print(summary_vars)
  summary_vars <- purrr::map(summary_vars, function(var) {
    expr(mean(!!var, na.rm = TRUE))
  })
  print(summary_vars)
  .data %>%
    group_by(time)
  summarise(!!!summary_vars)  # Unquote-splice the list
}
grouped_mean3(x, var("var1"), var("var2"))
Yields
Error in !summary_vars : invalid argument type
The original cause is "Must group by variables found in .data.", where it finds a column that isn't in the dummy "x" I generated for testing purposes. I have no idea what's happening, sadly.
How do I actually extract the mean from the new summary_vars and add it to the .data table? summary_vars becomes something like
[[1]]
mean(~var1, na.rm = TRUE)
[[2]]
mean(~var2, na.rm = TRUE)
Which seems close, but needs evaluation. How do I evaluate this? !!! wasn't working.
For what it's worth, I tried plugging the example in dplyr into this R engine https://rdrr.io/cran/dplyr/man/starwars.html and it didn't work either.
Help?
End goal would be a table along the lines of
[Time, var1mean, var2mean, var3mean, var4mean...]
Try this:
library(dplyr)
grouped_mean3 <- function(.data, ...) {
  vars <- c(...)
  .data %>%
    group_by(time) %>%
    summarise(across(all_of(vars), mean, .names = "{.col}mean"))
}
grouped_mean3(x, 'var1')
# time var1mean
# <dbl> <dbl>
#1 1 2
#2 4 5
grouped_mean3(x, 'var1', 'var2')
# time var1mean var2mean
# <dbl> <dbl> <dbl>
#1 1 2 3
#2 4 5 6
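If there are many columns, the names can also be generated rather than typed out (a hypothetical usage sketch, assuming the columns to summarise all start with "var"):
grouped_mean3(x, grep("^var", names(x), value = TRUE))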
Perhaps this is what you are looking for?
x %>%
  group_by(time) %>%
  summarise_at(vars(starts_with('var')), ~ mean(., na.rm = TRUE)) %>%
  rename_at(vars(starts_with('var')), funs(paste0(., "mean"))) %>%
  merge(x)
With the data from your question, the following is the output:
time var1mean var2mean var1 var2
1 1 2 3 2 3
2 4 5 6 5 6
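Note that funs() has since been deprecated in dplyr; a rough equivalent on current dplyr (a sketch, not part of the original answer) would be:
x %>%
  group_by(time) %>%
  summarise(across(starts_with("var"), ~ mean(.x, na.rm = TRUE), .names = "{.col}mean")) %>%
  merge(x)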
I find purrr SUPER useful for making results tables for a bunch of different variables. I was wondering if there was a way for the unnest() function (or otherwise) to expand a high order variable into blanks, rather than just repeating.
For example, with this code:
library(tidyverse)
data <- data.frame(
  group1 = sample(c('dog', 'cat', 'gecko'), 100, replace = T),
  group2 = sample(c('hot dog', 'not hotdog', 'other'), 100, replace = T)
)

my_freq <- function(var){
  result <- as.data.frame(table(data[[var]]))
  colnames(result) <- c('level', 'n')
  return(result)
}

the_table <- data.frame(var = c('group1', 'group2'))

the_table <- the_table %>%
  mutate(
    result = map(var, my_freq)
  ) %>%
  unnest(result)
Instead of the resulting table repeating the var value on every row of its group, I would like the repeats to be left blank, so only the first row of each group shows the var name. I guess this would be a multi-level index in Python, but I'm not sure how to accomplish it in R.
Extending rmagno's solution to other high order variables
... %>%
  mutate_at(
    .vars = vars(high_order_vars),
    .funs = function(x) ifelse(duplicated(.[['var']]), NA, x)
  )
Not sure what you mean by blank (in this example I am going with NA). The critical line is:
mutate(var = if_else(!duplicated(var), var, NA_integer_))
Minimal working example:
library(tidyverse)
data <- data.frame(
  group1 = sample(c('dog', 'cat', 'gecko'), 100, replace = T),
  group2 = sample(c('hot dog', 'not hotdog', 'other'), 100, replace = T)
)

my_freq <- function(var){
  result <- as.data.frame(table(data[[var]]))
  colnames(result) <- c('level', 'n')
  return(result)
}

the_table <- data.frame(var = c('group1', 'group2'))

the_table <- the_table %>%
  mutate(
    result = map(var, my_freq)
  ) %>%
  unnest(result) %>%
  mutate(var = if_else(!duplicated(var), var, NA_integer_))
the_table
#> # A tibble: 6 x 3
#> var level n
#> <fct> <fct> <int>
#> 1 group1 cat 38
#> 2 <NA> dog 38
#> 3 <NA> gecko 24
#> 4 group2 hot dog 36
#> 5 <NA> not hotdog 34
#> 6 <NA> other 30
Created on 2020-02-29 by the reprex package (v0.3.0)
You could just use an lapply to get a list tb of the two tables. Then create a matrix with names(tb) in the first row and blanks elsewhere, and convert it to a data frame. Finally, Map assigns the desired names while applying cbind to the columns of that data frame and the data frames in tb consecutively.
tb <- lapply(data, function(x) setNames(as.data.frame(table(x)), c("level", "n")))
res <- do.call(rbind,
               Map(cbind,
                   var = data.frame(
                     matrix(c(names(tb), rep("", (el(lapply(tb, nrow)) - 1)*2)),
                            ncol = 2, byrow = TRUE)),
                   tb))
res
res
# var level n
# X1.1 group1 cat 31
# X1.2 dog 26
# X1.3 gecko 43
# X2.1 group2 hot dog 35
# X2.2 not hotdog 37
# X2.3 other 28
What I'm trying to do here is bring in dplyr::select() semantics into a function supplied to dplyr::mutate(). Below is a minimal example.
dat <- tibble(class = rep(c("A", "B"), each = 10),
x = sample(100, 20),
y = sample(100, 20),
z = sample(100, 20))
.reorder_rows <- function(...) {
x <- list(...)
y <- as.matrix(do.call("cbind", x))
h <- hclust(dist(y))
return(h$order)
}
dat %>%
group_by(class) %>%
mutate(h_order = .reorder_rows(x, y, z))
## class x y z h_order
## <chr> <int> <int> <int> <int>
## 1 A 85 17 5 1
## 2 A 67 24 35 5
## ...
## 18 B 76 7 94 9
## 19 B 65 39 85 8
## 20 B 49 11 100 10
##
## Note: function applied across each group, A and B
What I would like to do is something along the lines of:
dat %>%
  group_by(class) %>%
  mutate(h_order = .reorder_rows(-class))
The reason this is important is that when dat has many more variables, I need to be able to exclude the grouping/specific variables from the function's calculation.
I'm not sure how this would be implemented, but somehow using select semantics within the .reorder_rows function might be one way to tackle this problem.
For this particular approach, you should probably nest and unnest (using tidyr) by class rather than grouping by it:
library(tidyr)
library(purrr)

dat %>%
  nest(-class) %>%
  mutate(h_order = map(data, .reorder_rows)) %>%
  unnest()
Incidentally, notice that while this works with your function you could also write a shorter version that takes the data frame directly:
.reorder_rows <- function(x) {
  h <- hclust(dist(as.matrix(x)))
  return(h$order)
}
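On current tidyr, nest() wants a named selection and unnest() wants explicit columns, so a rough equivalent of the pipeline above (a sketch, not part of the original answer) would be:
dat %>%
  nest(data = -class) %>%
  mutate(h_order = map(data, .reorder_rows)) %>%
  unnest(c(data, h_order))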
I would like to create a dataframe with confidence intervals for proportions as a final result. I have introduced a variable (tp in my example) as a cut off value to calculate the proportions for. I would like to use the dplyr package to produce the final dataframe.
Below is a simplified example:
library(dplyr)
my_names <- c("A", "B")
dt <- data.frame(
  Z = sample(my_names, 100, replace = TRUE),
  X = sample(1:10, 100, replace = TRUE),
  Y = sample(c(0, 1), 100, replace = TRUE)
)
my.df <- dt %>%
  mutate(tp = (X > 8) * 1) %>%  # multiply by one to convert into numeric
  group_by(Z, tp) %>%
  summarise(n = n()) %>%
  mutate(prop.tp = n / sum(n)) %>%
  mutate(SE.tp = sqrt((prop.tp * (1 - prop.tp)) / n)) %>%
  mutate(Lower_limit = prop.tp - 1.96 * SE.tp) %>%
  mutate(Upper_limit = prop.tp + 1.96 * SE.tp)
output:
Source: local data frame [4 x 7]
Groups: Z
Z tp n prop.tp SE.tp Lower_limit Upper_limit
1 A 0 33 0.6346154 0.08382498 0.4703184 0.7989123
2 A 1 19 0.3653846 0.11047236 0.1488588 0.5819104
3 B 0 27 0.5625000 0.09547033 0.3753782 0.7496218
4 B 1 21 0.4375000 0.10825318 0.2253238 0.6496762
However, I would like to calculate the standard error and the CIs using the total sample for each group in column Z, not the sample split by the categorical variable tp.
So the total sample for A in my example should be n = 33 + 19.
Any ideas?
Not quite sure I get which group you want to compare with which here, but at any rate you have two grouping variables tp = X > 8 and Z.
If you want to compare the rows with X > 8 and Z == "A" to all rows with X > 8 you can do it like this
merge(
  dt %>%
    group_by(X > 8) %>%
    summarize(n.X = n()),
  dt %>%
    group_by(X > 8, Z) %>%
    summarise(n.XZ = n()),
  by = "X > 8"
) %>%
  mutate(prop.XZ = n.XZ / n.X) %>%
  mutate(SE = sqrt((prop.XZ * (1 - prop.XZ)) / n.X)) %>%
  mutate(Lower_limit = prop.XZ - 1.96 * SE) %>%
  mutate(Upper_limit = prop.XZ + 1.96 * SE)
X > 8 n.X Z n.XZ prop.XZ SE Lower_limit Upper_limit
1 FALSE 70 A 37 0.5285714 0.05966378 0.4116304 0.6455124
2 FALSE 70 B 33 0.4714286 0.05966378 0.3544876 0.5883696
3 TRUE 30 A 16 0.5333333 0.09108401 0.3548087 0.7118580
4 TRUE 30 B 14 0.4666667 0.09108401 0.2881420 0.6451913
If you want to turn the problem around and compare X > 8 and Z == "A" to all rows with Z == "A" you can do it like this
merge(
  dt %>%
    group_by(Z) %>%
    summarize(n.Z = n()),
  dt %>%
    group_by(X > 8, Z) %>%
    summarise(n.XZ = n()),
  by = "Z"
) %>%
  mutate(prop.XZ = n.XZ / n.Z) %>%
  mutate(SE = sqrt((prop.XZ * (1 - prop.XZ)) / n.Z)) %>%
  mutate(Lower_limit = prop.XZ - 1.96 * SE) %>%
  mutate(Upper_limit = prop.XZ + 1.96 * SE)
Z n.Z X > 8 n.XZ prop.XZ SE Lower_limit Upper_limit
1 A 53 FALSE 37 0.6981132 0.06305900 0.5745176 0.8217088
2 A 53 TRUE 16 0.3018868 0.06305900 0.1782912 0.4254824
3 B 47 FALSE 33 0.7021277 0.06670743 0.5713811 0.8328742
4 B 47 TRUE 14 0.2978723 0.06670743 0.1671258 0.4286189
It is a bit messy having to merge two separate groupings, but I don't know if it is possible to ungroup and re-group in the same statement. I am surprised, though, at how difficult it seems to be to use groupings on two different levels (if you can call it that), and I hope someone else can come up with a better solution.
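For what the OP asked (proportions within tp but SE based on the per-Z total, e.g. n = 33 + 19 for A), one way to re-group in a single pipeline might be the following sketch (not part of the original answer; it counts by Z and tp, then groups by Z alone so the SE uses the per-Z total):
dt %>%
  mutate(tp = as.integer(X > 8)) %>%
  count(Z, tp) %>%                  # n per combination of Z and tp
  group_by(Z) %>%
  mutate(n.Z = sum(n),              # total sample within Z
         prop.tp = n / n.Z,
         SE.tp = sqrt(prop.tp * (1 - prop.tp) / n.Z),
         Lower_limit = prop.tp - 1.96 * SE.tp,
         Upper_limit = prop.tp + 1.96 * SE.tp) %>%
  ungroup()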