how to convert the information from row to column in R - r

I have a dataset named "df":
df <- structure(list(outcome = c("cg00000029", "cg00000029", "cg00000029",
"cg00000108", "cg00000108", "cg00000108"),
pval = c("0.63", "0.91", "0.01","0.09", "0.55", "0.23")),
.Names = c("outcome", "pval"), class = "data.frame",row.names = c(NA, -6L))
How could I convert it into dataset named "df1"?
# Expected wide result: one row per outcome, with that outcome's p-values
# spread across pval_1..pval_3 in their order of appearance in df.
df1 <- structure(list(outcome = c("cg00000029", "cg00000108"),
pval_1 = c("0.63", "0.09"),
pval_2 = c("0.91", "0.55"),
pval_3 = c("0.01", "0.23")),
.Names = c("outcome", "pval_1", "pval_2", "pval_3"), class = "data.frame",row.names = c(NA, -2L))
Thank you!

A data.table option using dcast
> library(data.table)
> # value.var is named explicitly so dcast() does not have to guess the value column
> dcast(setDT(df), outcome ~ paste0("pval_", rowid(outcome)), value.var = "pval")
outcome pval_1 pval_2 pval_3
1: cg00000029 0.63 0.91 0.01
2: cg00000108 0.09 0.55 0.23

Here is a tidyverse approach:
library(dplyr)
library(tidyr)
# Number the p-values within each outcome, then spread them into
# pval_1, pval_2, ... columns (one row per outcome).
df %>%
  group_by(outcome) %>%
  mutate(id = seq_len(n())) %>%
  pivot_wider(names_from = id, values_from = pval, names_prefix = "pval_")
# A tibble: 2 x 4
# Groups: outcome [2]
outcome pval_1 pval_2 pval_3
<chr> <chr> <chr> <chr>
1 cg00000029 0.63 0.91 0.01
2 cg00000108 0.09 0.55 0.23

Related

How to unnest a data frame containing list of list with varied length?

I was trying to unnest the following data frame.
df.org <- structure(list(Gene = "ARIH1", Description = "E3 ubiquitin-protein ligase ARIH1",
condition2_cellline = list(c("MCF7", "Jurkat")), condition2_activity = list(
c(40.8284023668639, 13.26973)), condition2_concentration = list(
c("100uM", "100uM")), condition3_cellline = list("Jurkat"),
condition3_activity = list(-4.60251), condition3_concentration = list(
"100uM")), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame"))
This is my code:
# Unnest every list-column of df.org in parallel; keep_empty = TRUE keeps rows
# whose list elements are empty instead of dropping them.
df.output <- df.org %>%
  unnest(where(is.list), keep_empty = TRUE)
This is what I got:
structure(list(Gene = c("ARIH1", "ARIH1"), Description = c("E3 ubiquitin-protein ligase ARIH1",
"E3 ubiquitin-protein ligase ARIH1"), condition2_cellline = c("MCF7",
"Jurkat"), condition2_activity = c(40.8284023668639, 13.26973
), condition2_concentration = c("100uM", "100uM"), condition3_cellline = c("Jurkat",
"Jurkat"), condition3_activity = c(-4.60251, -4.60251), condition3_concentration = c("100uM",
"100uM")), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-2L))
Is there a way to avoid duplicating those variables with a shorter length? The following output is what I want to get.
df.desired <- structure(list(Gene = c("ARIH1", "ARIH1"), Description = c("E3 ubiquitin-protein ligase ARIH1",
"E3 ubiquitin-protein ligase ARIH1"), condition2_cellline = c("MCF7",
"Jurkat"), condition2_activity = c(40.8284023668639, 13.26973
), condition2_concentration = c("100uM", "100uM"), condition3_cellline = c(NA,
"Jurkat"), condition3_activity = c(NA, -4.60251), condition3_concentration = c(NA,
"100uM")), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-2L))
Thanks so much for any help!
We could also do without reshaping i.e. get the max of the list column lengths in a column, then loop across those list columns, modify the length with the max value and use unnest
library(dplyr)
library(purrr)
library(tidyr)
# Pad every list-column element to a common length so that unnest() can expand
# the list-columns side by side without duplicating the shorter ones.
df.org %>%
# l1: the largest element length found across all list-columns
mutate(l1 = max(across(where(is.list), lengths)),
# `length<-` extends each element to l1 (padding with NA); l1 = NULL then drops the helper column
across(where(is.list), ~ map(.x, `length<-`, l1)), l1 = NULL) %>%
unnest(where(is.list), keep_empty = TRUE)
-output
# A tibble: 2 × 8
Gene Description condition2_cellline condition2_activity condition2_concentration condition3_cellline condition3_activity condition3_concentration
<chr> <chr> <chr> <dbl> <chr> <chr> <dbl> <chr>
1 ARIH1 E3 ubiquitin-protein ligase ARIH1 MCF7 40.8 100uM Jurkat -4.60 100uM
2 ARIH1 E3 ubiquitin-protein ligase ARIH1 Jurkat 13.3 100uM <NA> NA <NA>
Here is a suggestion for how it could work:
We pivot_longer all list columns,
apply a function to make all the lists the same length,
then pivot back and unnest.
library(dplyr)
library(tidyr)
# Pad every condition* list-column to a common length, then unnest them together.
df.org %>%
  # long format: one row per condition* column, its list element held in `value`
  pivot_longer(cols = starts_with("condition")) %>%
  # extend each element to the longest length, padding with NA
  mutate(value = lapply(value, `length<-`, max(lengths(value)))) %>%
  pivot_wider(names_from = name, values_from = value) %>%
  # same selector as pivot_longer(), so the column list is not hard-coded
  unnest(cols = starts_with("condition"))
Gene Description condition2_cell~ condition2_acti~ condition2_conc~ condition3_cell~ condition3_acti~ condition3_conc~
<chr> <chr> <chr> <dbl> <chr> <chr> <dbl> <chr>
1 ARIH1 E3 ubiquitin-prot~ MCF7 40.8 100uM Jurkat -4.60 100uM
2 ARIH1 E3 ubiquitin-prot~ Jurkat 13.3 100uM NA NA NA
>

Aggregate columns based on categories given by another dataframe

I have a dataframe where each column has some vector of data. I want to apply the mean columnwise, but filtered by groups which are given by a second dataframe. That is, each column belongs to a group and this information is in the second dataframe.
Here is some example dataset: df is the dataframe with the data vectors, df_category contains the category for each column.
df=structure(list(x1 = c(0.461302090047301, -1.19974381763812, -0.888258056235799,
0.300889698419314, 0.836911163114131, 0.0540388337324712), x2 = c(1.33736696170763,
-0.687026295689823, 1.12205295626651, -0.848925266014684, 1.16092168555067,
0.591202293337843), x3 = c(-0.279052669225263, -0.780435476613128,
-0.852870619718068, -0.708611614262357, -0.761659405740852, 0.487033696695474
), x4 = c(-0.222767493777229, 1.50328295132467, 0.934670132217215,
1.37678188537077, 0.343280062984192, 1.23279081824003), x5 = c(-1.08074586121729,
0.208120194894818, -0.52245832008453, 0.944618465137011, 0.749834485631317,
-0.81118414509141)), class = "data.frame", row.names = c(NA,
-6L))
df_category=structure(list(Col_name = structure(1:5, .Label = c("x1", "x2",
"x3", "x4", "x5"), class = "factor"), Category = structure(c(1L,
1L, 2L, 2L, 2L), .Label = c("A", "B"), class = "factor")), class = "data.frame", row.names = c(NA,
-5L))
The result I want is this one:
df_result=structure(list(mean_A = c(0.899334525877468, -0.943385056663974,
0.116897450015357, -0.274017783797685, 0.998916424332403, 0.322620563535157
), mean_B = c(-0.527522008073261, 0.310322556535454, -0.146886269195128,
0.537596245415141, 0.110485047624885, 0.302880123281364)), class = "data.frame", row.names = c(NA,
-6L))
in Base R:
# Category of the column each cell of df belongs to: one entry per cell, in
# the same column-major order as unlist(df).
a <- setNames(df_category$Category, df_category$Col_name)[names(df)[col(df)]]
# Mean per (row, category) pair.
tapply(X = unlist(df), INDEX = list(row(df), a), FUN = mean)
A B
1 0.8993345 -0.5275220
2 -0.9433851 0.3103226
3 0.1168975 -0.1468863
4 -0.2740178 0.5375962
5 0.9989164 0.1104850
6 0.3226206 0.3028801
Another option:
# Split df's columns by category and take the row means within each group.
# Col_name is a factor: index df by its labels rather than its integer codes,
# so the right columns are picked even when df's column order differs from
# the factor's level order.
sapply(with(df_category, split.default(df[as.character(Col_name)], Category)), rowMeans)
A B
[1,] 0.8993345 -0.5275220
[2,] -0.9433851 0.3103226
[3,] 0.1168975 -0.1468863
[4,] -0.2740178 0.5375962
[5,] 0.9989164 0.1104850
[6,] 0.3226206 0.3028801
We can use tidyverse to reshape the data values, merge the category data, and compute means for groups "A" and "B":
library(tidyverse)
# Row-wise mean of df's columns within each category listed in df_category.
df_result <- df %>%
  # idx keeps track of the original row through the reshape
  mutate(idx = row_number()) %>%
  pivot_longer(-idx) %>%
  inner_join(df_category, by = c(name = "Col_name")) %>%
  group_by(Category, idx) %>%
  # .groups = "drop": return an ungrouped result and silence the regrouping message
  summarize(mean = mean(value), .groups = "drop") %>%
  pivot_wider(names_from = Category, values_from = mean, names_prefix = "mean_") %>%
  select(-idx)
mean_A mean_B
<dbl> <dbl>
1 0.899 -0.528
2 -0.943 0.310
3 0.117 -0.147
4 -0.274 0.538
5 0.999 0.110
6 0.323 0.303

Extracting elements from a list to create a matrix

I have a list of countries with lists inside each one of them.
Just to give you an example of a list object for one country with lists for two countries (df_DOTS):
df_DOTS <- list(BR = structure(list(`#FREQ` = "M", `#REF_AREA` = "AU", `#INDICATOR` = "TXG_FOB_USD",
`#COUNTERPART_AREA` = "BR", `#UNIT_MULT` = "6", `#TIME_FORMAT` = "P1M",
Obs = list(structure(list(`#TIME_PERIOD` = c("2019-07", "2019-08",
"2019-09"), `#OBS_VALUE` = c("55.687747", "36.076581", "57.764474"
)), class = "data.frame", row.names = c(NA, 3L)))), row.names = 2L, class = "data.frame"),
US = structure(list(`#FREQ` = "M", `#REF_AREA` = "AU", `#INDICATOR` = "TXG_FOB_USD",
`#COUNTERPART_AREA` = "US", `#UNIT_MULT` = "6", `#TIME_FORMAT` = "P1M",
Obs = list(structure(list(`#TIME_PERIOD` = c("2019-07",
"2019-08", "2019-09"), `#OBS_VALUE` = c("876.025841",
"872.02118", "787.272851")), class = "data.frame", row.names = c(NA,
3L)))), row.names = 1L, class = "data.frame"))
I can reach the matrix (matrix_DOTS) I am looking for using these lines of code:
library(dplyr)
library(rlist)
library(magrittr)
BR <- df_DOTS[["BR"]][["Obs"]] %>%
list.select(.$`#OBS_VALUE`) %>%
unlist() %>%
sapply(function(x) as.numeric(as.character(x))) %>%
mean()
US <- df_DOTS[["US"]][["Obs"]] %>%
list.select(.$`#OBS_VALUE`) %>%
unlist() %>%
sapply(function(x) as.numeric(as.character(x))) %>%
mean()
matrix_DOTS <- matrix(c(BR, US), nrow = 1, dimnames = list(c("AU"), c("BR", "US")))
Since I have a list of several countries with lists of other several countries inside them, I am looking for a more practical way of achieving matrix_DOTS. Any help is highly appreciated!
PS: This is the dput for the final matrix in this example:
matrix_DOTS <- structure(c(49.842934, 845.106624), .Dim = 1:2, .Dimnames = list(
"AU", c("BR", "US")))
EDIT
This is the procedure to obtain df_DOTS:
library(IMFData)
databaseID <- "DOT"
startdate = "2019-07-01"
enddate = "2019-09-01"
checkquery = FALSE
queryfilter <- list(CL_FREQ = "M", CL_AREA_DOT = "AU",
CL_INDICATOR_DOT = "TXG_FOB_USD",
CL_COUNTERPART_AREA_DOT = c("BR", "US"))
df_DOTS <- CompactDataMethod(databaseID, queryfilter, startdate, enddate, checkquery) %>%
split(.$`#COUNTERPART_AREA`)
Just add tidy = TRUE to the CompactDataMethod call:
library(IMFData)
databaseID <- "DOT"
startdate = "2019-07-01"
enddate = "2019-09-01"
checkquery = FALSE
queryfilter <- list(CL_FREQ = "M", CL_AREA_DOT = "AU",
CL_INDICATOR_DOT = "TXG_FOB_USD",
CL_COUNTERPART_AREA_DOT = c("BR", "US"))
df_DOTS <- CompactDataMethod(databaseID,
queryfilter,
startdate,
enddate,
checkquery,
tidy = TRUE)
df_DOTS
#TIME_PERIOD #OBS_VALUE #FREQ #REF_AREA #INDICATOR #COUNTERPART_AREA #UNIT_MULT #TIME_FORMAT
1 2019-07 876.025841 M AU TXG_FOB_USD US 6 P1M
2 2019-08 872.02118 M AU TXG_FOB_USD US 6 P1M
3 2019-09 787.272851 M AU TXG_FOB_USD US 6 P1M
4 2019-07 55.687747 M AU TXG_FOB_USD BR 6 P1M
5 2019-08 36.076581 M AU TXG_FOB_USD BR 6 P1M
6 2019-09 57.764474 M AU TXG_FOB_USD BR 6 P1M
you just need one group_by(#COUNTERPART_AREA) %>% summarise(mean = mean(#OBS_VALUE)):
library(tidyverse)
# Mean observation per reporting area and counterpart, one column per counterpart.
df_DOTS %>%
  group_by(`#COUNTERPART_AREA`, `#REF_AREA`) %>%
  # #OBS_VALUE is stored as character, so convert before averaging
  summarise(mean = mean(as.numeric(`#OBS_VALUE`)), .groups = "drop") %>%
  # pivot_wider() supersedes spread()
  pivot_wider(names_from = `#COUNTERPART_AREA`, values_from = mean)
#output
`#REF_AREA` BR US
<chr> <dbl> <dbl>
1 AU 49.8 845.
Or if you insist on a matrix
# Same summary as a numeric matrix: reporting areas as row names, counterparts as columns.
df_DOTS %>%
  group_by(`#COUNTERPART_AREA`, `#REF_AREA`) %>%
  # #OBS_VALUE is stored as character, so convert before averaging
  summarise(mean = mean(as.numeric(`#OBS_VALUE`)), .groups = "drop") %>%
  # pivot_wider() supersedes spread()
  pivot_wider(names_from = `#COUNTERPART_AREA`, values_from = mean) %>%
  column_to_rownames("#REF_AREA") %>%
  as.matrix()
#output
BR US
AU 49.84293 845.1066
From the input data, we could loop over with map, pluck the elements that are needed, convert to numeric, get the mean, and convert to a two column tibble with enframe
library(purrr)
library(tidyr)
map(df_DOTS, ~ .x %>%
pluck("Obs", 1, "#OBS_VALUE") %>%
as.numeric %>%
mean) %>%
enframe %>%
unnest(c(value))
# A tibble: 2 x 2
# name value
# <chr> <dbl>
#1 BR 49.8
#2 US 845.
Another option would be like this:
tmp <- df_DOTS %>%
as_tibble() %>%
summarise(across(everything(), ~mean(as.numeric(.x$Obs[[1]]$`#OBS_VALUE`))))
tmp
# # A tibble: 1 x 2
# BR US
# <dbl> <dbl>
# 1 49.8 845.

Divide by last row in mutate in Tidyverse

So this is a relatively simple problem, I have a dataset as below
df <- structure(list(term = c("(Intercept)", "overall_quality", "overall_costs",
"wwpf"), estimate = c(0.388607224137536, 0.456477162621961, 0.485612564501229,
NA), std.error = c(0.499812263278414, 0.0987819420575201, 0.108042289289401,
NA), statistic = c(0.777506381273137, 4.62105879995918, 4.49465267438447,
NA), p.value = c(0.440597919486169, 0.0000279867005591494, 0.0000426773877613654,
NA), average = c(NA, 8.09615384615385, 7.86538461538461, 7.90384615384615
), Elasticity = c(NA, 3.69570933584318, 3.81952959386543, NA)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -4L))
I am trying to use below
df %>% mutate(Elasticity= average*estimate/average[nrow(df)])
Expected output: https://ibb.co/42ptLXx
basically, divide by last row value & since I am trying to incorporate this in function, I need the method to be dynamic & not hard coded value.
Please help !
We can use n() to return the index of last row for subsetting the value of that column
library(dplyr)
df %>%
mutate(Elasticity= average*estimate/average[n()])
If we need a function (using rlang_0.4.0), we can make use of {{ }} for evaluation
#' Scale the product of two columns by the last value of the first
#'
#' @param dat A data frame.
#' @param col1 Unquoted column; its value at position `n()` is the divisor.
#' @param col2 Unquoted column multiplied element-wise with `col1`.
#' @return `dat` with an `Elasticity` column added (or overwritten).
f1 <- function(dat, col1, col2) {
  mutate(
    dat,
    Elasticity = {{ col1 }} * {{ col2 }} / {{ col1 }}[n()]
  )
}
f1(df, average, estimate)
# A tibble: 4 x 7
# term estimate std.error statistic p.value average Elasticity
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 (Intercept) 0.389 0.500 0.778 0.441 NA NA
#2 overall_quality 0.456 0.0988 4.62 0.0000280 8.10 0.468
#3 overall_costs 0.486 0.108 4.49 0.0000427 7.87 0.483
#4 wwpf NA NA NA NA 7.90 NA

Error : Error in mutate_impl(.data, dots) : Column `three_month` must be length 1 (the group size), not 3766742 dplyr R

My weekly dataset has different state_id values associated with different cities. value1 and value2 need to be aggregated to the monthly level and then to the quarterly level, so I am trying the code below:
library(dplyr)
df <- dataset %>%
group_by(state_id,city_id) %>%
group_by(three_month = round_date(weekly_dt, "quarter")) %>%
summarise_at(vars(starts_with('value')), mean)
But it throws this error:
Error in mutate_impl(.data, dots) :
Column `three_month` must be length 1 (the group size), not 3766742
Note : All cities don't have same level of weekly data that is the reason I used group_by first.
Can someone help me in R.
EDIT: my data
structure(list(city_id = c("B02", "B02", "B02",
"B02", "B02", "B02"), state_id = c(609L, 609L,
609L, 609L, 609L, 609L), weekly_dt = structure(c(17601,
17545, 17447, 17727, 17510, 17664), class = "Date"), value1 = c(0.194669883125,
0.35, 0.35, 0.124875972916667, 0.35, 0.140909438125), value2 = c(0.203018924883721,
0.35, 0.35, 0.35, 0.35, 0.35)), class = c("data.table", "data.frame"
), row.names = c(NA, -6L), .internal.selfref = <pointer: 0x0000000004541ef0>)
The mutate function adds additional columns to the data frame, which can then be referenced in a group_by. floor_date instead of round_date may be better here because all dates within the quarter will be placed in the same quarter.
library(dplyr)
library(lubridate)
# Average every value* column per state, city and calendar quarter.
df <- dataset %>%
  # floor_date() maps each week to the first day of its quarter
  mutate(three_month = floor_date(weekly_dt, "quarter")) %>%
  group_by(state_id, city_id, three_month) %>%
  # across() supersedes summarise_at(vars(...), ...); "drop_last" keeps the
  # result grouped by state_id and city_id, as before
  summarise(across(starts_with("value"), mean), .groups = "drop_last")
# A tibble: 4 x 5
# Groups: state_id, city_id [?]
# state_id city_id three_month value1 value2
# <int> <chr> <date> <dbl> <dbl>
# 1 609 B02 2017-10-01 0.350 0.350
# 2 609 B02 2018-01-01 0.272 0.277
# 3 609 B02 2018-04-01 0.141 0.350
# 4 609 B02 2018-07-01 0.125 0.350

Resources