Write a Data.Table as a csv file - r

I have a data.table that has list values within the columns. Below is the dput:
dput(df2)
structure(list(a = list(structure(5594.05118603497, .Names = "a"),
structure(8877.42723091876, .Names = "a"), structure(2948.95666065332,
.Names = "a"),
structure(5312.77623937465, .Names = "a"), structure(676.637044992807,
.Names = "a"),
structure(323.104243007498, .Names = "a")), b =
list(structure(3.90258318853593e-06, .Names = "b"),
structure(3.89772483584672e-06, .Names = "b"), structure(3.91175458242421e-
06, .Names = "b"),
structure(3.90169532031545e-06, .Names = "b"), structure(6.54536728417568e-
06, .Names = "b"),
structure(6.59087917747312e-06, .Names = "b")), id = 1:6), .Names = c("a",
"b", "id"), class = c("data.table", "data.frame"), row.names = c(NA,
-6L), .internal.selfref = <pointer: 0x0000000000220788>)
Here is what the output looks like:
head(df2)
a b id
1: 5594.051 3.902583e-06 1
2: 8877.427 3.897725e-06 2
3: 2948.957 3.911755e-06 3
4: 5312.776 3.901695e-06 4
5: 676.637 6.545367e-06 5
6: 323.1042 6.590879e-06 6
This looks ok when you see it at first but if you look further into it, this is what it looks like when I want to select a column:
How do I change df2 to just be a normal dataframe where it doesn't have these extra values within a and b like this? I am trying to write this file to a csv but it will not allow me to because it is saying there are vectors as the values.
Thanks!
Edit:
This was the code that generated the lists:
test<-sapply( split( df , df$ID),
function(d){ dat <- list2env(d)
nlsfit <- nls( form = y ~ a * (1-exp(-b * x)), data=dat,
start= list( a=max(dat$y), b=b.start),
control= control1)
list(a = coef(nlsfit)[1], b = coef(nlsfit)[2])} )
df1<-as.data.frame(t(test))

Load the right package, look at its help page, search for "csv", follow the Usage section:
library(data.table)
help(pac=data.table)
fwrite(df2, file="~/test.csv") # for mac, need changing for other OS
Another approach might be:
as.data.frame( lapply(df2, unlist) )

Related

How to cbind a list of tables by one column, and suffix headings with the list item name

I've got a list of dataframes. I'd like to cbind them by the index column, sample_id. Each table has the same column headings, so I can't just cbind them otherwise I won't know which list item the columns came from. The name of the list item gives the measure used to generate them, so I'd like to suffix the column headings with the list item name.
Here's a simplified demo list of dataframes:
list_of_tables <- list(number = structure(list(sample_id = structure(1:3, levels = c("CSF_1",
"CSF_2", "CSF_4"), class = "factor"), total = c(655, 331, 271
), max = c(12, 5, 7)), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame")), concentration_cm_3 = structure(list(sample_id = structure(1:3, levels = c("CSF_1",
"CSF_2", "CSF_4"), class = "factor"), total = c(121454697, 90959097,
43080697), max = c(2050000, 2140000, 915500)), row.names = c(NA,
-3L), class = c("tbl_df", "tbl", "data.frame")), volume_nm_3 = structure(list(
sample_id = structure(1:3, levels = c("CSF_1", "CSF_2", "CSF_4"
), class = "factor"), total = c(2412783009, 1293649395, 438426087
), max = c(103500000, 117400000, 23920000)), row.names = c(NA,
-3L), class = c("tbl_df", "tbl", "data.frame")), area_nm_2 = structure(list(
sample_id = structure(1:3, levels = c("CSF_1", "CSF_2", "CSF_4"
), class = "factor"), total = c(15259297.4, 7655352.2, 3775922
), max = c(266500, 289900, 100400)), row.names = c(NA, -3L
), class = c("tbl_df", "tbl", "data.frame")))
You'll see it's a list of 4 tables, and the list item names are "number", "concentration_cm_3", "volume_nm_3", and "area_nm_2".
Using join_all from plyr I can merge them all by sample_id. However, how do I suffix with the list item name?
merged_tables <- plyr::join_all(stats_by_measure, by = "sample_id", type = "left")
we could do it this way:
The trick is to use .id = 'id' in bind_rows which adds the name as a column. Then we could pivot:
library(dplyr)
library(tidyr)
bind_rows(list_of_tables, .id = 'id') %>%
pivot_wider(names_from = id,
values_from = c(total, max))
sample_id total_number total_concentration_cm_3 total_volume_nm_3 total_area_nm_2 max_number max_concentration_cm_3 max_volume_nm_3 max_area_nm_2
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 CSF_1 655 121454697 2412783009 15259297. 12 2050000 103500000 266500
2 CSF_2 331 90959097 1293649395 7655352. 5 2140000 117400000 289900
3 CSF_4 271 43080697 438426087 3775922 7 915500 23920000 100400
Probably, we may use reduce2 here with suffix option from left_join
library(dplyr)
library(purrr)
nm <- names(list_of_tables)[1]
reduce2(list_of_tables, names(list_of_tables)[-1],
function(x, y, z) left_join(x, y, by = 'sample_id', suffix = c(nm, z)))
Or if we want to use join_all, probably we can rename the columns before doing the join
library(stringr)
imap(list_of_tables, ~ {
nm <- .y
.x %>% rename_with(~str_c(.x, nm), -1)
}) %>%
plyr::join_all( by = "sample_id", type = "left")
Or use a for loop
tmp <- list_of_tables[[1]]
names(tmp)[-1] <- paste0(names(tmp)[-1], names(list_of_tables)[1])
for(nm in names(list_of_tables)[-1]) {
tmp2 <- list_of_tables[[nm]]
names(tmp2)[-1] <- paste0(names(tmp2)[-1], nm)
tmp <- left_join(tmp, tmp2, by = "sample_id")
}
tmp

execute different functions considering output in r

Let's say I have 2 different functions to apply. For example, these functions are max and min . After applying bunch of functions I am getting outputs below. I want to assign a function to each output.
Here is my data and its structure.
data<-structure(list(Apr = structure(list(`a1` = structure(list(
date = c("04-01-2036", "04-02-2036", "04-03-2036"), value = c(0,
3.13, 20.64)), .Names = c("date", "value"), row.names = 92:94, class = "data.frame"),
`a2` = structure(list(date = c("04-01-2037", "04-02-2037",
"04-03-2037"), value = c(5.32, 82.47, 15.56)), .Names = c("date",
"value"), row.names = 457:459, class = "data.frame")), .Names = c("a1",
"a2")), Dec = structure(list(`d1` = structure(list(
date = c("12-01-2039", "12-02-2039", "12-03-2039"), value = c(3,
0, 11)), .Names = c("date", "value"), row.names = 1431:1433, class = "data.frame"),
`d2` = structure(list(date = c("12-01-2064", "12-02-2064",
"12-03-2064"), value = c(0, 5, 0)), .Names = c("date", "value"
), row.names = 10563:10565, class = "data.frame")), .Names = c("d1",
"d2"))), .Names = c("Apr", "Dec"))
I applied these functions:
drop<-function(y){
lapply(y, function(x)(x[!(names(x) %in% c("date"))]))
}
q1<-lapply(data, drop)
q2<-lapply(q1, function(x) unlist(x,recursive = FALSE))
daily_max<-lapply(q2, function(x) lapply(x, max))
dailymax <- data.frame(matrix(unlist(daily_max), nrow=length(daily_max), byrow=TRUE))
row.names(dailymax)<-names(daily_max)
max_value <- apply(dailymax, 1, which.max)
And I'm getting
Apr Dec
2 1
And I am applying any random function to both Apr[2] and Dec[1] like:
Map(function(x, y) sum(x[[y]]), q2, max_value)
So, the function will be executed considering the outputs (to Apr's second element which is a1, Dec's first element which is a2.) As you can see, there are outputs as numbers 1 and 2.
What I want
What I want is assigning specific functions to 1 and 2. If output is 1 then max function; if it is 2, min function will be executed. In conclusion, max function will be applied to Apr[2] and min function will be applied to Dec[1].
I will get this:
min(q2$Apr$a2.value)
[1] 5.32
max(q2$Dec$d2.value)
[1] 5
How can I achieve this automatically for all my functions?
You can take help of switch here to apply a function based on number in max_value.
apply_function <- function(x, num) switch(num, `1` = max, `2` = min)(x)
Map(function(x, y) apply_function(x[[y]], y), q2, max_value)
#$Apr
#[1] 5.32
#$Dec
#[1] 11
Map returns a list if you want a vector output use mapply.

Match strings from main df with those in reference df - if found, add all cols from that row of the reference df into main df

I have a "main_df" along the lines of this:
structure(list(study_id = c("02ipnnqgeovkrxz", "02ipnnqgeovkrxz",
"02ipnnqgeovkrxz", "02ipnnqgeovkrxz", "02ipnnqgeovkrxz", "02ipnnqgeovkrxz"
), question = c("3eEVJgaAP6c9FPL", "b8GLxGjZKtstCQZ", "40iyFKjeMEFGI2V",
"6eZGejSZ1oTZYLb", "3pXAUvZH8GGuryd", "0kYkUAHe4iODUl7"), study_rt = c("1.353",
"0.714", "0.68", "0.695", "0.696", "0.656"), study_response = c("picture",
"picture", "picture", "picture", "picture", "picture")), row.names = c(NA,
-6L), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), reshapeWide = list(
v.names = NULL, timevar = "index", idvar = c("study_id",
"question"), times = c("rt", "response"), varying = structure(c("response.rt",
"response.response"), .Dim = 1:2)), groups = structure(list(
study_id = "02ipnnqgeovkrxz", .rows = list(1:6)), row.names = c(NA,
-1L), class = c("tbl_df", "tbl", "data.frame"), .drop = TRUE))
and a reference df along the lines of this:
structure(list(stim = c("ashtray_word", "bell_word", "blouse_word",
"boot_word", "bottle_word", "bread_word"), url = c("eW1BRoUDV4BKQMl",
"5zKTGwHlwlzpssB", "55SVfoQudZJNCFT", "bOORR1zuKYSnAe9", "6RrOQfDZim81pHv",
"1F97ouH0HrwQOgZ"), study_list = c("A", "A", "A", "A", "A", "A"
)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"))
Each value in the 'question' column of the main df can be found in the 'url' column of the reference df. I want to match these values, and add all columns from that row of the reference df to my main df. The output will look like this:
structure(list(study_id = c("02ipnnqgeovkrxz", "02ipnnqgeovkrxz",
"02ipnnqgeovkrxz", "02ipnnqgeovkrxz", "02ipnnqgeovkrxz", "02ipnnqgeovkrxz"
), question = c("3eEVJgaAP6c9FPL", "b8GLxGjZKtstCQZ", "40iyFKjeMEFGI2V",
"6eZGejSZ1oTZYLb", "3pXAUvZH8GGuryd", "0kYkUAHe4iODUl7"), study_rt = c("1.353",
"0.714", "0.68", "0.695", "0.696", "0.656"), study_response = c("picture",
"picture", "picture", "picture", "picture", "picture"), stim = c("chisel_picture",
"raccoon_picture", "apple_picture", "belt_picture", "bicycle_picture",
"cake_picture"), url = c("3eEVJgaAP6c9FPL", "b8GLxGjZKtstCQZ",
"40iyFKjeMEFGI2V", "6eZGejSZ1oTZYLb", "3pXAUvZH8GGuryd", "0kYkUAHe4iODUl7"
), study_list = c("B FILLER", "B FILLER", "B", "B", "B", "B")), row.names = c(NA,
-6L), groups = structure(list(study_id = "02ipnnqgeovkrxz", .rows = list(
1:6)), row.names = c(NA, -1L), class = c("tbl_df", "tbl",
"data.frame"), .drop = TRUE), class = c("grouped_df", "tbl_df", "tbl", "data.frame"))
This will allow me to see the 'sensible' item names (e.g. "chisel_picture") that subjects were responding to, as opposed to the nonsensical code names I have now (e.g. "3eEVJgaAP6c9FPL"). The same items appear over and over again in the 'question' column (as different subjects saw the same items), and I need to preserve these repeats.
I have successfully managed this using a for loop...but it's super slow! A tidyverse solution would be amazing!
My awful for loop (study_data = main df / image_urls = reference df):
all_study_stim_items <- study_data$question # List all values in 'question' column.
matched_items <- tibble() # Create empty tibble to store results of for loop.
for (i in all_study_stim_items) {
temp <- image_urls %>%
filter(url == i) %>%
select(stim, url, study_list)
matched_items <- bind_rows(matched_items, temp) } # Continuously overwrite tibble with each match.
# I then join this with the main df.

Lexis function not found in R

I am using this code from the R help guide in the Epi
package:
# A small bogus cohort
xcoh <- structure( list( id = c("A", "B", "C"),
birth = c("14/07/1952", "01/04/1954",
"10/06/1987"),
entry = c("04/08/1965", "08/09/1972",
"23/12/1991"),
exit = c("27/06/1997", "23/05/1995",
"24/07/1998"),
fail = c(1, 0, 1) ),
.Names = c("id", "birth", "entry", "exit",
"fail"),
row.names = c("1", "2", "3"),
class = "data.frame" )
# Define a Lexis object with timescales calendar time and
age
Lcoh <- Lexis( entry = list( per=entry ),
exit = list( per=exit,
age=exit-birth ),
exit.status = fail,
data = xcoh )
But I get this error:
Error in Lexis(entry = list(per = entry), exit = list(per = exit, age = exit - :
could not find function "Lexis"
Any thoughts?
Epi package first needs to be installed in the environment using:
install.packages("Epi")
And then the library for Epi needs to be loaded.
library(Epi)
Hence your code being modified as follows:
install.packages("Epi")
library(Epi)
xcoh <- structure( list( id = c("A", "B", "C"),
birth = c("14/07/1952", "01/04/1954",
"10/06/1987"),
entry = c("04/08/1965", "08/09/1972",
"23/12/1991"),
exit = c("27/06/1997", "23/05/1995",
"24/07/1998"),
fail = c(1, 0, 1) ),
.Names = c("id", "birth", "entry", "exit",
"fail"),
row.names = c("1", "2", "3"),
class = "data.frame" )
# Define a Lexis object with timescales calendar time and
Lcoh <- Lexis( entry = list( per=entry ),
exit = list( per=exit,
age=exit-birth ),
exit.status = fail,
data = xcoh )
Note: I have removed the line that says age. Assuming it is not relevant to the question posted here.

Splitting column of a data.frame into more columns

I want to split the Out column of Test data.frame into columns separating based on blank space. Here is my MWE. I tried separate function from tidyr package and strsplit function from base R but couldn't figured out the problem.
Test <-
structure(list(Out = structure(1:2, .Label = c("t1* -0.4815861 0.3190424 0.2309631",
"t2* 0.9189246 -0.1998455 0.2499412"), class = "factor")),
.Names = "Out", row.names = c(NA, -2L), class = "data.frame")
library(dplyr)
library(tidyr)
Test %>% separate(Out, c("A", "B", "C", "D"), sep = " ")
Error: Values not split into 4 pieces at 1, 2
strsplit(Test$Out, " ")
Error in strsplit(Test$Out, " ") : non-character argument
try
Test %>% separate(Out, c("A", "B", "C", "D"), sep = "\\s+")
which allows for multiple spaces (\\s+).

Resources