I have a very wide dataset (2000+ variables) that I'm trying to make tidy but I am getting stuck trying to pull out a value from the variable name. If I have a variable that is "E1Time1_Date" I'd like to reshape it to be three variables: E=1, Time=1, and Date=the original date value.
Is this even possible? I've tried to use gather() but am guessing there is a step I need to do first that I am missing. Thank you for your help!
And here is the sample dataset if anyone wanted to make the magic happen:
structure(list(ID = c(123, 225), UnrelatedV1 = c("Unrelated1",
"Unrelated1"), UnrelatedV2 = c("Unrelated2", "Unrelated2"), E1T1_Date = structure(c(1506816000,
1513296000), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
E1T1_v1 = c(10, 20), E1T1_v2 = c(20, 20), E1T1_v3 = c(30,
20), E1T1_v4 = c(40, 20), E1T2_Date = structure(c(1512086400,
NA), class = c("POSIXct", "POSIXt"), tzone = "UTC"), E1T2_v1 = c(10,
NA), E1T2_v2 = c(10, NA), E1T2_v3 = c(10, NA), E1T2_v4 = c(10,
NA), E2T1_Date = structure(c(1522540800, 1525132800), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), E2T1_v1 = c(10, 20), E2T1_v2 = c(20,
20), E2T1_v3 = c(10, 20), E2T1_v4 = c(10, 20), E2T2_Date = structure(c(1533859200,
NA), class = c("POSIXct", "POSIXt"), tzone = "UTC"), E2T2_v1 = c(10,
NA), E2T2_v2 = c(30, NA), E2T2_v3 = c(10, NA), E2T2_v4 = c(10,
NA)), .Names = c("ID", "UnrelatedV1", "UnrelatedV2", "E1T1_Date",
"E1T1_v1", "E1T1_v2", "E1T1_v3", "E1T1_v4", "E1T2_Date", "E1T2_v1",
"E1T2_v2", "E1T2_v3", "E1T2_v4", "E2T1_Date", "E2T1_v1", "E2T1_v2",
"E2T1_v3", "E2T1_v4", "E2T2_Date", "E2T2_v1", "E2T2_v2", "E2T2_v3",
"E2T2_v4"), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-2L))
Looks like you've got a mix of numeric and date values, which will make gathering a little tricky. One way to do it is to convert dates to numeric for now, then you can change them back once you get to your final format. This should get you started.
library(tidyverse)
# Reshape the wide E<n>T<n>_<measure> columns into one row per
# ID / E / T / measure combination.
data %>%
  # convert dates to numeric so we can gather them in the same column
  # (lubridate:: prefix because lubridate is only attached by tidyverse >= 2.0.0)
  mutate_if(lubridate::is.POSIXct, as.integer) %>%
  gather(-ID, -contains("Unrelated"), key = variable, value = value) %>%
  # split each name on its own pattern, e.g. "E12T3_Date" -> "E12", "T3", "Date";
  # this does not assume the "T" sits at the same position in every name
  tidyr::extract(variable, into = c("E", "T", "vDate"),
                 regex = "^(E\\d+)(T\\d+)_(.*)$")
# A tibble: 40 x 7
ID UnrelatedV1 UnrelatedV2 E T vDate value
<dbl> <chr> <chr> <chr> <chr> <chr> <dbl>
1 123 Unrelated1 Unrelated2 E1 T1 Date 1506816000
2 225 Unrelated1 Unrelated2 E1 T1 Date 1513296000
3 123 Unrelated1 Unrelated2 E1 T1 v1 10
4 225 Unrelated1 Unrelated2 E1 T1 v1 20
5 123 Unrelated1 Unrelated2 E1 T1 v2 20
6 225 Unrelated1 Unrelated2 E1 T1 v2 20
7 123 Unrelated1 Unrelated2 E1 T1 v3 30
8 225 Unrelated1 Unrelated2 E1 T1 v3 20
9 123 Unrelated1 Unrelated2 E1 T1 v4 40
10 225 Unrelated1 Unrelated2 E1 T1 v4 20
Related
I just want to know how to find empty values and replace them with NA across a whole data frame
sample data
# Sample data as a parseable dput(): the comma after `pi = ""` is restored and
# the `.internal.selfref` external pointer (which cannot be parsed back) is dropped.
structure(list(id = structure(8.44425875736171e-318, class = "integer64"),
project_id = 11L, experiment_id = 85L,
gene = "", si = -0.381, pi = "",
on1 = "CC",
on2 = "GG",
on3 = "aa",
created_at = structure(1618862091.85075, class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -1L), class = c("data.table",
"data.frame"))
I have a solution that checks a particular column, but I don't know how to apply it to the whole data frame
# Replace missing entries of the gene column with the literal string "NA".
data$gene <- ifelse(is.na(data$gene), "NA", data$gene)
You could use lapply with gsub to replace each empty cell with NA like this:
df <- structure(list(id = structure(8.44425875736171e-318, class = "integer64"),
project_id = 11L, experiment_id = 85L,
gene = "", si = -0.381, pi = "",
on1 = "CC",
on2 = "GG",
on3 = "aa",
created_at = structure(1618862091.85075, class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -1L), class = c("data.table",
"data.frame"))
df
#> id project_id experiment_id gene si pi on1 on2 on3
#> 1 8.444259e-318 11 85 -0.381 CC GG aa
#> created_at
#> 1 2021-04-19 19:54:51
# Apply gsub() to character columns only: gsub() returns character, so running
# it on every column would turn the integer, double, date-time and integer64
# columns into strings.
df[] <- lapply(df, function(x) {
  if (is.character(x)) gsub("^$", NA, x) else x
})
df
#> id project_id experiment_id gene si pi on1 on2 on3
#> 1 8.444259e-318 11 85 <NA> -0.381 <NA> CC GG aa
#> created_at
#> 1 2021-04-19 19:54:51
Created on 2022-11-02 with reprex v2.0.2
You can also use dplyr with mutate and across
library(dplyr)
library(tidyr)
df <- structure(list(id = structure(8.44425875736171e-318, class = "integer64"),
project_id = 11L, experiment_id = 85L,
gene = "", si = -0.381, pi = "",
on1 = "CC",
on2 = "GG",
on3 = "aa",
created_at = structure(1618862091.85075, class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -1L), class = c("data.table",
"data.frame"))
# In every character column, turn empty strings into NA.
df %>%
  mutate(across(where(is.character), function(col) gsub("^$", NA, col)))
Note that I also attempted to use replace_na, however this only works on values that are actually NA.
# replace_na() only touches real NA values; empty strings are left as they are.
df %>%
mutate(dplyr::across(where(is.character), ~ replace_na(.x, "NA")))
"" (an empty string) is not considered NA, so replace_na() leaves it untouched
Only an actual NA is considered NA
Keep that in mind while you are performing your analysis.
Using na_if
library(data.table)
library(dplyr)
# For each column of the data.table: character columns get "" replaced by NA,
# every other column is returned unchanged.
df[, lapply(.SD, function(col) {
  if (is.character(col)) na_if(col, "") else col
})]
-output
id project_id experiment_id gene si pi on1 on2 on3 created_at
<i64> <int> <int> <char> <num> <char> <char> <char> <char> <POSc>
1: 1709137 11 85 <NA> -0.381 <NA> CC GG aa 2021-04-19 19:54:51
I have a few large dataframes in RStudio, that have this structure:
Original data structure
structure(list(CHROM = c("scaffold1000|size223437", "scaffold1000|size223437",
"scaffold1000|size223437", "scaffold1000|size223437"), POS = c(666,
1332, 3445, 4336), REF = c("A", "TA", "CTTGA", "GCTA"), RO = c(20,
14, 9, 25), ALT_1 = c("GAT", "TGC", "AGC", "T"), ALT_2 = c("CAG",
"TGA", "CGC", NA), ALT_3 = c("G", NA, "TGA", NA), ALT_4 = c("AGT",
NA, NA, NA), AO_1 = c(13, 4, 67, 120), AO_2 = c(12, 5, 34, NA
), AO_3 = c(6, NA, 18, NA), AO_4 = c(101, NA, NA, NA), AOF_1 = c(8.55263157894737,
17.3913043478261, 52.34375, 82.7586206896552), AOF_2 = c(7.89473684210526,
21.7391304347826, 26.5625, NA), AOF_3 = c(3.94736842105263, NA,
14.0625, NA), AOF_4 = c(66.4473684210526, NA, NA, NA)), class = "data.frame", row.names = c(NA,
-4L))
But for an analysis I need it to look like this:
Desired output
structure(list(CHROM = c("scaffold1000|size223437", "scaffold1000|size223437",
"scaffold1000|size223437", "scaffold1000|size223437"), POS = c(666,
1332, 3445, 4336), REF = c("A", "TA", "CTTGA", "GCTA"), RO = c(20,
14, 9, 25), ALT_1 = c("AGT", "TGA", "AGC", "T"), ALT_2 = c("CAG",
"TGC", "CGC", NA), ALT_3 = c("G", NA, "TGA", NA), ALT_4 = c("GAT",
NA, NA, NA), AO_1 = c(101, 5, 67, 120), AO_2 = c(12, 4, 34, NA
), AO_3 = c(6, NA, 18, NA), AO_4 = c(13, NA, NA, NA), AOF_1 = c(66.4473684210526,
21.7391304347826, 52.34375, 82.7586206896552), AOF_2 = c(7.89473684210526,
17.3913043478261, 26.5625, NA), AOF_3 = c(3.94736842105263, NA,
14.0625, NA), AOF_4 = c(8.55263157894737, NA, NA, NA)), class = "data.frame", row.names = c(NA,
-4L))
So what I would like to do is to rearrange the content of a row in a way, that the columns ALT_1, ALT_2, ALT_3, ALT_4 are alphabetically sorted, but at the same time I also need to rearrange the corresponding columns of AO and AOF, so that the values still match.
(The value of AO_1 should still match with the sequence that was in ALT_1.
So if ALT_1 becomes ALT_2 in the sorted dataframe, AO_1 should also become AO_2)
What I tried so far, but didn't work:
Pasting the values of ALT_1, AO_1, AOF_1 all in one field, so I have them together with
# NOTE(review): the loop header was missing from the snippet (the closing brace
# was unbalanced); a row-wise loop over X is assumed here - confirm.
for (i in seq_len(nrow(X))) {
  # Only combine the three fields when column 6 holds a value in this row
  if (!is.na(X[i, 6])) {
    X[i, 6] <- paste(X[i, 6], X[i, 10], X[i, 14], sep = " ")
  }
}
And then I wanted to extract every row as a vector to sort the values and put it back in the dataframe, but I didn't manage to do this.
So the question would be how I can order the dataframe to get the desired output?
(I need to apply this to 32 dataframes with each having >100.000 values)
Here is a dplyr solution. It took me some time, and I needed some help because pivot_wider() does not keep the order produced by arrange():
library(dplyr)
library(tidyr)
# Sort the ALT alleles alphabetically within each row and carry the matching
# AO / AOF values along. The three measures stay in their own typed columns, so
# the ordering depends on ALT alone (not on how "_" or digits collate inside a
# pasted string) and no value is round-tripped through text.
df1 %>%
  # remember which original row every allele belongs to
  mutate(id = row_number()) %>%
  # one row per allele slot, with ALT / AO / AOF as separate columns
  pivot_longer(
    matches("^(ALT|AO|AOF)_[0-9]+$"),
    names_to = c(".value", "slot"),
    names_sep = "_"
  ) %>%
  # arrange() always puts NA last
  arrange(id, ALT) %>%
  group_by(id) %>%
  # renumber the slots in sorted order
  mutate(slot = row_number()) %>%
  ungroup() %>%
  pivot_wider(names_from = slot, values_from = c(ALT, AO, AOF)) %>%
  select(CHROM, POS, REF, RO, starts_with("ALT"), starts_with("AO_"), starts_with("AOF_")) %>%
  # whole-number columns as integers; done explicitly because type.convert()
  # would also turn a column holding only the allele "T" into logical TRUE
  mutate(across(c(POS, RO, starts_with("AO_")), as.integer))
CHROM POS REF RO ALT_1 ALT_2 ALT_3 ALT_4 AO_1 AO_2 AO_3 AO_4 AOF_1 AOF_2 AOF_3 AOF_4
<chr> <int> <chr> <int> <chr> <chr> <chr> <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
1 scaffold1000|size223437 666 A 20 AGT CAG G GAT 101 12 6 13 66.4 7.89 3.95 8.55
2 scaffold1000|size223437 1332 TA 14 TGA TGC NA NA 5 4 NA NA 21.7 17.4 NA NA
3 scaffold1000|size223437 3445 CTTGA 9 AGC CGC TGA NA 67 34 18 NA 52.3 26.6 14.1 NA
4 scaffold1000|size223437 4336 GCTA 25 T NA NA NA 120 NA NA NA 82.8 NA NA NA
here is a data.table approach
library(data.table)
# Set to data.table format
setDT(mydata)
# Melt to long format: one row per allele slot with ALT / AO / AOF side by side
DT.melt <- melt(mydata, measure.vars = patterns(ALT = "^ALT_", AO = "^AO_", AOF = "^AOF_"))
# order by groups, na's at the end
setorderv(DT.melt, cols = c("CHROM", "POS", "ALT"), na.last = TRUE)
# cast to wide again; number the alleles within each original row (all id
# columns), not within REF alone - otherwise rows sharing a REF value keep
# counting and create extra ALT_5, ALT_6, ... columns
dcast(DT.melt, CHROM + POS + REF + RO ~ rowid(CHROM, POS, REF, RO), value.var = c("ALT", "AO", "AOF"))
# CHROM POS REF RO ALT_1 ALT_2 ALT_3 ALT_4 AO_1 AO_2 AO_3 AO_4 AOF_1 AOF_2 AOF_3 AOF_4
# 1: scaffold1000|size223437 666 A 20 AGT CAG G GAT 101 12 6 13 66.44737 7.894737 3.947368 8.552632
# 2: scaffold1000|size223437 1332 TA 14 TGA TGC <NA> <NA> 5 4 NA NA 21.73913 17.391304 NA NA
# 3: scaffold1000|size223437 3445 CTTGA 9 AGC CGC TGA <NA> 67 34 18 NA 52.34375 26.562500 14.062500 NA
# 4: scaffold1000|size223437 4336 GCTA 25 T <NA> <NA> <NA> 120 NA NA NA 82.75862 NA NA NA
help <- data.frame(
id = c(100, 100, 101, 102, 102),
q1 = c(NA, 1, NA, NA, 3),
q2 = c(1, NA, 2, NA, NA),
q3 = c(NA, 1, NA, 4, NA),
q4 = c(NA, NA, 4, NA, 5),
group = c("a", "b", "c", "a", "c"))
help$group <- as.character(help$group)
I am trying to pivot longer so dataset looks like this:
id score group
100 NA a
100 1 b
100 NA c
...
But I get an error with the numeric values of q1-q4 and the character string group.
pivot_longer(help, !id, names_to = "score",
values_to = "group", values_ptypes = list(group = 'character'))
Error: Can't convert <double> to <character>.
How can I pivot longer while also preserving the group variable? (There are many missing values in q1-q4, but every row has an id and a group.)
library(tidyr)
# Stack q1-q4 into a single score column, keeping id and group as identifiers;
# the question label is dropped and the rows are ordered by id, then group.
output <- help %>%
  pivot_longer(cols = !c(id, group), names_to = "question", values_to = "score") %>%
  dplyr::select(!question) %>%
  dplyr::arrange(id, group)
Output
head(output)
# A tibble: 6 × 3
id group score
<dbl> <chr> <dbl>
1 100 a NA
2 100 a 1
3 100 a NA
4 100 a NA
5 100 b 1
6 100 b NA
My objective is to find the amount of time that has elapsed in between the first time a user used a service and the second time a user used the service. Users who have only used the service once should be excluded.
For example, I have a sample dataset test:
> test
start_time user_id
1 2018-01-17 22:10:21 1
2 2018-01-17 22:10:08 2
3 2018-01-18 07:02:36 3
4 2018-01-18 07:24:18 4
5 2018-01-18 15:08:45 2
6 2018-01-18 15:26:57 1
7 2018-01-18 15:37:47 1
8 2018-01-18 20:12:43 3
9 2018-01-18 20:01:08 2
10 2018-01-18 22:42:02 2
I am able to go one by one with difftime:
output$time_lapse[1] <- abs(difftime(test$start_time[1], test$start_time[6]))
But this will take a long time on a large dataset. How could one iterate with data.table or dplyr ?
The output would be in hours as below, from the test dataset above:
> output
user_id time_lapse
1 1 17.27667
2 2 16.97694
3 3 13.16861
4 4 NA
Any suggestions would be appreciated! Here is the sample data:
> dput(test)
structure(list(start_time = structure(c(1516255821, 1516255808,
1516287756, 1516289058, 1516316925, 1516318017, 1516318667, 1516335163,
1516334468, 1516344122), class = c("POSIXct", "POSIXt"), tzone = ""),
user_id = c(1, 2, 3, 4, 2, 1, 1, 3, 2, 2)), .Names = c("start_time",
"user_id"), row.names = c(NA, 10L), class = "data.frame")
Here is data.table method
library(data.table)
test <- structure(list(start_time = structure(c(1516255821, 1516255808,
1516287756, 1516289058, 1516316925, 1516318017, 1516318667, 1516335163,
1516334468, 1516344122), class = c("POSIXct", "POSIXt"), tzone = ""),
user_id = c(1, 2, 3, 4, 2, 1, 1, 3, 2, 2)), .Names = c("start_time",
"user_id"), row.names = c(NA, 10L), class = "data.frame")
setDT(test)
# Hours between each user's first and second use (NA for single-use users).
# Timestamps are sorted per user so the result does not depend on row order,
# and the unit is fixed so every group is reported in hours.
test[, .(time_lapse = {
  st <- sort(start_time)
  difftime(st[2L], st[1L], units = "hours")
}), by = user_id]
# user_id time_lapse
# 1: 1 17.27667 hours
# 2: 2 16.97694 hours
# 3: 3 13.16861 hours
# 4: 4 NA hours
Another possible solution with dplyr
df = structure(list(start_time = structure(c(1516255821, 1516255808,
1516287756, 1516289058, 1516316925, 1516318017, 1516318667, 1516335163,
1516334468, 1516344122), class = c("POSIXct", "POSIXt"), tzone = ""),
user_id = c(1, 2, 3, 4, 2, 1, 1, 3, 2, 2)), .Names = c("start_time",
"user_id"), row.names = c(NA, 10L), class = "data.frame")
library(dplyr)
# Hours between each user's first and second use; single-use users drop out
# because they have no second row.
df %>%
  # lag() compares with the previous row, so rows must be chronological
  arrange(start_time) %>%
  group_by(user_id) %>%
  mutate(dif = difftime(start_time, lag(start_time), units = "hours")) %>%
  filter(row_number() == 2)
Result:
# A tibble: 3 x 3
# Groups: user_id [3]
start_time user_id dif
<dttm> <dbl> <time>
1 2018-01-18 21:08:45 2 16.97694 hours
2 2018-01-18 21:26:57 1 17.27667 hours
3 2018-01-19 02:12:43 3 13.16861 hours
I have imported data from matlab and have a large list (over 1000 list elements) from which I created the following sample dataset data with only two list elements.
data <- structure(list(TEST.DATA.1.1 = structure(list(ID = c(2, 2, 2), YEAR = c(1990, 1991, 1992), DATA.1 = c(10, 20, 30), DATA.NAME = structure(c(1L, 1L, 1L), class = "factor", .Label = "Test"), Remarks = c(1990, 1991, 1992)), .Names = c("ID", "YEAR", "DATA.1", "DATA.NAME", "Remarks"), row.names = c(NA, -3L), class = "data.frame"), TEST.DATA.2.1 = structure(list(ID = c(4, 4), YEAR = c(2000, 2001), DATA.1 = c(55, 60), DATA.2 = c(0, 2), DATA.3 = c(4, 6), DATA.NAME.structure..n1....Dim...c.1L..1L.. = structure(c(1L,1L), class = "factor", .Label = "n1"), DATA.NAME.structure..n2....Dim...c.1L..1L.. = structure(c(1L, 1L), class = "factor", .Label = "n2"), DATA.NAME.structure..n3....Dim...c.1L..1L.. = structure(c(1L,1L), class = "factor", .Label = "n3"), Remarks = c(2000,2001)), .Names = c("ID", "YEAR", "DATA.1", "DATA.2", "DATA.3", "DATA.NAME.structure..n1....Dim...c.1L..1L..", "DATA.NAME.structure..n2....Dim...c.1L..1L..", "DATA.NAME.structure..n3....Dim...c.1L..1L..", "Remarks"), row.names = c(NA, -2L), class = "data.frame")), .Names = c("TEST.DATA.1.1", "TEST.DATA.2.1"))
data
$TEST.DATA.1.1
ID YEAR DATA.1 DATA.NAME Remarks
1 2 1990 10 Test 1990
2 2 1991 20 Test 1991
3 2 1992 30 Test 1992
$TEST.DATA.2.1
ID YEAR DATA.1 DATA.2 DATA.3 DATA.NAME.structure..n1....Dim...c.1L..1L.. DATA.NAME.structure..n2....Dim...c.1L..1L.. DATA.NAME.structure..n3....Dim...c.1L..1L.. Remarks
1 4 2000 55 0 4 n1 n2 n3 2000
2 4 2001 60 2 6 n1 n2 n3 2001
I am looking for a way how I could rename the data columns with the name from the column(s) DATA.NAME. Sometimes there are multiple data columns and respective names such as in the second list element and sometimes there is only one such as in the first element. I am looking for a way to do the renaming for a large list (> 1000 list elements) and then drop the DATA.NAME columns such as in data_new.
data_new
$TEST.DATA.1.1
ID YEAR Test Remarks
1 2 1990 10 1990
2 2 1991 20 1991
3 2 1992 30 1992
$TEST.DATA.2.1
ID YEAR n1 n2 n3 Remarks
1 4 2000 55 0 4 2000
2 4 2001 60 2 6 2001
Here's a base R approach:
# For every data frame in the list: rename the DATA.<n> columns with the labels
# found in the first row of the DATA.NAME* columns, then drop those columns.
data[] <- lapply(data, function(d) {
  label_cols <- grep('^DATA\\.NAME', names(d))
  value_cols <- grep('^DATA\\.\\d+', names(d))
  names(d)[value_cols] <- as.character(unlist(d[1, label_cols]))
  d[label_cols] <- list(NULL)
  d
})
data
## $TEST.DATA.1.1
## ID YEAR Test Remarks
## 1 2 1990 10 1990
## 2 2 1991 20 1991
## 3 2 1992 30 1992
##
## $TEST.DATA.2.1
## ID YEAR n1 n2 n3 Remarks
## 1 4 2000 55 0 4 2000
## 2 4 2001 60 2 6 2001
Solution using data.table package.
require(data.table)
data <- structure(list(TEST.DATA.1.1 = structure(list(ID = c(2, 2, 2), YEAR = c(1990, 1991, 1992), DATA.1 = c(10, 20, 30), DATA.NAME = structure(c(1L, 1L, 1L), class = "factor", .Label = "Test"), Remarks = c(1990, 1991, 1992)), .Names = c("ID", "YEAR", "DATA.1", "DATA.NAME", "Remarks"), row.names = c(NA, -3L), class = "data.frame"), TEST.DATA.2.1 = structure(list(ID = c(4, 4), YEAR = c(2000, 2001), DATA.1 = c(55, 60), DATA.2 = c(0, 2), DATA.3 = c(4, 6), DATA.NAME.structure..n1....Dim...c.1L..1L.. = structure(c(1L,1L), class = "factor", .Label = "n1"), DATA.NAME.structure..n2....Dim...c.1L..1L.. = structure(c(1L, 1L), class = "factor", .Label = "n2"), DATA.NAME.structure..n3....Dim...c.1L..1L.. = structure(c(1L,1L), class = "factor", .Label = "n3"), Remarks = c(2000,2001)), .Names = c("ID", "YEAR", "DATA.1", "DATA.2", "DATA.3", "DATA.NAME.structure..n1....Dim...c.1L..1L..", "DATA.NAME.structure..n2....Dim...c.1L..1L..", "DATA.NAME.structure..n3....Dim...c.1L..1L..", "Remarks"), row.names = c(NA, -2L), class = "data.frame")), .Names = c("TEST.DATA.1.1", "TEST.DATA.2.1"))
# Rename the DATA.<n> columns of one data frame with the labels stored in the
# first row of its DATA.NAME* columns, then drop the DATA.NAME* columns.
#
# x: a data.frame (or data.table).
# Returns a new data.table; data.table(x) copies, so the input is not modified.
fun <- function(x) {
  x <- data.table(x)
  # anchored patterns with an escaped dot, so only DATA.<digits> columns match
  value_cols <- grep("^DATA\\.[0-9]+$", names(x), value = TRUE)
  name_cols <- grep("^DATA\\.NAME", names(x), value = TRUE)
  new_names <- as.character(unlist(x[1L, name_cols, with = FALSE], use.names = FALSE))
  setnames(x, value_cols, new_names)
  # `:=` combined with `with = FALSE` is deprecated; wrap the LHS in parentheses
  if (length(name_cols) > 0L) {
    x[, (name_cols) := NULL]
  }
  x
}
data_new <- lapply(data, fun)
This should work...
library(dplyr)
# Rename the DATA.<n> columns of every list element with the labels stored in
# its DATA.NAME* columns, then drop the DATA.NAME* columns.
# seq_along() keeps the loop from running when `data` is empty.
for (i in seq_along(data)) {
  d <- data[[i]]
  # Find the new names (taken from the first row of the label columns)
  name_cols <- select(d, starts_with("DATA.NAME"))
  new_names <- as.character(unlist(name_cols[1, , drop = FALSE], use.names = FALSE))
  # Remove the columns containing the names
  d <- select(d, -starts_with("DATA.NAME"))
  # Pick which columns we want to replace (anchored, with the dot escaped)
  to_replace <- grep("^DATA\\.[0-9]+$", names(d))
  # Replace those names
  names(d)[to_replace] <- new_names
  # Replace the list element
  data[[i]] <- d
}