I have a dataframe being returned from Microsoft365R:
SKA_student <- structure(list(name = "Computing SKA 2021-22.xlsx", size = 22266L,
lastModifiedBy =
structure(list(user =
structure(list(email = "my#email.com",
id = "8ae50289-d7af-4779-91dc-e4638421f422",
displayName = "Name, My"), class = "data.frame", row.names = c(NA, -1L))),
class = "data.frame", row.names = c(NA, -1L)),
fileSystemInfo = structure(list(
createdDateTime = "2021-09-08T16:03:38Z",
lastModifiedDateTime = "2021-09-16T00:09:04Z"), class = "data.frame", row.names = c(NA,-1L))), row.names = c(NA, -1L), class = "data.frame")
I can return all the lastModifiedBy data through:
SKA_student %>% select(lastModifiedBy)
lastModifiedBy.user.email lastModifiedBy.user.id lastModifiedBy.user.displayName
1 my#email.com 8ae50289-d7af-4779-91dc-e4638421f422 Name, My
But if I want a specific item in the lastModifiedBy list, it doesn't work, e.g.:
SKA_student %>% select(lastModifiedBy.user.email)
Error: Can't subset columns that don't exist.
x Column `lastModifiedBy.user.email` doesn't exist.
I can get this working with base R, but I would really like a dplyr answer.
This function allows you to flatten all the list columns (I found this ages ago on SO but can't find the original post for credit)
# Flatten every list column of `data` into ordinary columns.
#
# Columns for which is.list() is TRUE (true list columns as well as nested
# data.frame columns) are unlisted row by row and bound back onto the
# remaining columns. The new column names are whatever unlist() generates,
# e.g. lastModifiedBy.user.email.
#
# data: a data.frame, possibly containing list / data.frame columns.
# Returns: a data.frame with the non-list columns first, followed by the
#   flattened columns; `data` itself when it has no list columns.
SO_flat_cols <- function(data) {
  # vapply() always yields a logical vector, even for a zero-column input
  # (sapply() would hand back an empty list there).
  ListCols <- vapply(data, is.list, logical(1))
  if (!any(ListCols)) {
    return(data)
  }
  # apply() works on a matrix copy of the list columns, so values of mixed
  # type are coerced to a common type (usually character).
  # NOTE(review): assumes every row unlists to the same number of values;
  # ragged rows are not handled - confirm against real data.
  flattened <- t(apply(data[ListCols], 1, unlist))
  cbind(data[!ListCols], flattened)
}
Then you can select as you like.
SO_flat_cols (SKA_student) %>%
select(lastModifiedBy.user.email)
Alternatively you can get to the end by recursively pulling the lists
SKA_student %>%
pull(lastModifiedBy) %>%
pull(user) %>%
select(email)
You could use
library(dplyr)
library(tidyr)
SKA_student %>%
unnest_wider(lastModifiedBy) %>%
select(email)
This returns
# A tibble: 1 x 1
email
<chr>
1 my#email.com
Related
I've got a list of dataframes. I'd like to cbind them by the index column, sample_id. Each table has the same column headings, so I can't just cbind them otherwise I won't know which list item the columns came from. The name of the list item gives the measure used to generate them, so I'd like to suffix the column headings with the list item name.
Here's a simplified demo list of dataframes:
list_of_tables <- list(number = structure(list(sample_id = structure(1:3, levels = c("CSF_1",
"CSF_2", "CSF_4"), class = "factor"), total = c(655, 331, 271
), max = c(12, 5, 7)), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame")), concentration_cm_3 = structure(list(sample_id = structure(1:3, levels = c("CSF_1",
"CSF_2", "CSF_4"), class = "factor"), total = c(121454697, 90959097,
43080697), max = c(2050000, 2140000, 915500)), row.names = c(NA,
-3L), class = c("tbl_df", "tbl", "data.frame")), volume_nm_3 = structure(list(
sample_id = structure(1:3, levels = c("CSF_1", "CSF_2", "CSF_4"
), class = "factor"), total = c(2412783009, 1293649395, 438426087
), max = c(103500000, 117400000, 23920000)), row.names = c(NA,
-3L), class = c("tbl_df", "tbl", "data.frame")), area_nm_2 = structure(list(
sample_id = structure(1:3, levels = c("CSF_1", "CSF_2", "CSF_4"
), class = "factor"), total = c(15259297.4, 7655352.2, 3775922
), max = c(266500, 289900, 100400)), row.names = c(NA, -3L
), class = c("tbl_df", "tbl", "data.frame")))
You'll see it's a list of 4 tables, and the list item names are "number", "concentration_cm_3", "volume_nm_3", and "area_nm_2".
Using join_all from plyr I can merge them all by sample_id. However, how do I suffix with the list item name?
# Left-join every table of the demo list on sample_id. The value columns
# (total, max) share the same names in each table, hence the question.
merged_tables <- plyr::join_all(list_of_tables, by = "sample_id", type = "left")
We could do it this way:
The trick is to use .id = 'id' in bind_rows, which adds each list item's name as a column. Then we can pivot:
library(dplyr)
library(tidyr)
# Stack the tables, recording each table's list name in `id`, then spread
# total and max into one column per list name.
stacked <- bind_rows(list_of_tables, .id = "id")
pivot_wider(stacked, names_from = id, values_from = c(total, max))
sample_id total_number total_concentration_cm_3 total_volume_nm_3 total_area_nm_2 max_number max_concentration_cm_3 max_volume_nm_3 max_area_nm_2
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 CSF_1 655 121454697 2412783009 15259297. 12 2050000 103500000 266500
2 CSF_2 331 90959097 1293649395 7655352. 5 2140000 117400000 289900
3 CSF_4 271 43080697 438426087 3775922 7 915500 23920000 100400
Alternatively, we can use reduce2 here together with left_join, so that the columns are suffixed with the list item names:
library(dplyr)
library(purrr)
# left_join()'s `suffix` argument only renames columns whose names clash in
# that particular join, so with more than two tables some columns would stay
# unsuffixed or pick up the wrong suffix. Renaming each table's non-key
# columns with its own list name before joining avoids that.
add_suffix <- function(tbl, nm) {
  rename_with(tbl, function(col) paste0(col, nm), -sample_id)
}
nm <- names(list_of_tables)[1]
reduce2(
  list_of_tables[-1], names(list_of_tables)[-1],
  function(x, y, z) left_join(x, add_suffix(y, z), by = "sample_id"),
  .init = add_suffix(list_of_tables[[1]], nm)
)
Or if we want to use join_all, probably we can rename the columns before doing the join
library(stringr)
# Append each table's list name to all of its columns except the first one
# (sample_id in the demo data), then left-join the renamed tables.
add_measure_suffix <- function(tbl, measure) {
  rename_with(tbl, function(col) str_c(col, measure), -1)
}
renamed_tables <- imap(list_of_tables, add_measure_suffix)
plyr::join_all(renamed_tables, by = "sample_id", type = "left")
Or use a for loop
# Suffix every column except the first with the table's list name.
tag_columns <- function(tbl, nm) {
  names(tbl)[-1] <- paste0(names(tbl)[-1], nm)
  tbl
}
# Tag all tables, then fold them together with successive left joins.
tagged <- Map(tag_columns, list_of_tables, names(list_of_tables))
tmp <- Reduce(function(acc, nxt) left_join(acc, nxt, by = "sample_id"), tagged)
tmp
I have a simple dataframe as: dput(emp)
structure(list(name = structure(1L, .Label = "Alex", class = "factor"),
job = structure(1L, .Label = "", class = "factor"), Mgr = structure(1L, .Label = "", class = "factor"),
update = structure(18498, class = "Date")), class = "data.frame", row.names = c(NA,
-1L))
I want to convert all empty values to NULL (i.e. NA).
The simplest way to achieve this is:
emp[emp==""] <- NA
This would of course normally work, but I get the following error because of the date column:
Error in charToDate(x) :
character string is not in a standard unambiguous format
How can I convert all other empty rows to NULL without having to deal with the date column? Please note that the actual data frame has 30000+ rows.
Try formatting the date variable as character, make the change, and then transform it back to a date:
# Store the dates as text for now, so that comparing against "" is valid
emp$update <- as.character(emp$update)
# Mark every empty string as missing
is.na(emp) <- emp == ""
# Restore the Date class
emp$update <- as.Date(emp$update)
Output:
name job Mgr update
1 Alex <NA> <NA> 2020-08-24
You can try type.convert like below
type.convert(emp,as.is = TRUE)
such that
name job Mgr update
1 Alex NA NA 2020-08-24
You may try this using dplyr:
library(dplyr)
# `emp` is the data frame from the question.
# na_if() works on vectors; dplyr >= 1.1.0 no longer accepts a whole data
# frame, and mutate_at() is superseded by across(). replace() keeps factor
# columns as factors while turning "" into NA.
emp %>%
  mutate(across(update, as.character)) %>%
  mutate(across(
    where(is.factor) | where(is.character),
    ~ replace(.x, .x %in% "", NA)
  ))
As mentioned by @Duck, you have to format the date variable as character.
Afterwards you can transform it back to a date if you need it:
library(dplyr)
# `emp` is the data frame from the question.
# na_if() works on vectors; dplyr >= 1.1.0 no longer accepts a whole data
# frame, and mutate_at() is superseded by across(). replace() keeps factor
# columns as factors while turning "" into NA.
emp %>%
  mutate(across(update, as.character)) %>%
  mutate(across(
    where(is.factor) | where(is.character),
    ~ replace(.x, .x %in% "", NA)
  )) %>%
  # convert the date column back once the empty strings are gone
  mutate(across(update, as.Date))
See if this works:
> library(dplyr)
> library(purrr)
> emp <- structure(list(name = structure(1L, .Label = "Alex", class = "factor"),
+ job = structure(1L, .Label = "", class = "factor"), Mgr = structure(1L, .Label = "", class = "factor"),
+ update = structure(18498, class = "Date")), class = "data.frame", row.names = c(NA,
+ -1L))
> emp
name job Mgr update
1 Alex 2020-08-24
> emp %>% mutate(update = as.character(update)) %>% map_df(~gsub('^$',NA, .x)) %>% mutate(update = as.Date(update)) %>% mutate(across(1:3, as.factor))
# A tibble: 1 x 4
name job Mgr update
<fct> <fct> <fct> <date>
1 Alex NA NA 2020-08-24
>
I have a data.frame that contains a list-type column. Each list element contains a 1x3 data.frame. I only want one value from each of these. This will flatten my data.frame so I can write out a csv.
How do I select one item from the nested data.frame (see the 2nd column)?
Here's the nested col. I'd provide the data but cannot flatten to write_csv.
result of dput:
structure(list(id = c("1386707", "1386700", "1386462", "1386340",
"1386246", "1386300"), fields.created = c("2020-05-07T02:09:27.000-0700",
"2020-05-07T01:20:11.000-0700", "2020-05-06T21:38:14.000-0700",
"2020-05-06T07:19:44.000-0700", "2020-05-06T06:11:43.000-0700",
"2020-05-06T02:26:44.000-0700"), fields.customfield_10303 = c(NA,
NA, 3, 3, NA, NA), fields.customfield_28100 = list(NULL, structure(list(
self = ".../rest/api/2/customFieldOption/76412",
value = "New Feature", id = "76412"), .Names = c("self",
"value", "id"), class = "data.frame", row.names = 1L), structure(list(
self = ".../rest/api/2/customFieldOption/76414",
value = "Technical Debt", id = "76414"), .Names = c("self",
"value", "id"), class = "data.frame", row.names = 1L), NULL,
structure(list(self = ".../rest/api/2/customFieldOption/76411",
value = "Maintenance", id = "76411"), .Names = c("self",
"value", "id"), class = "data.frame", row.names = 1L), structure(list(
self = ".../rest/api/2/customFieldOption/76412",
value = "New Feature", id = "76412"), .Names = c("self",
"value", "id"), class = "data.frame", row.names = 1L))), row.names = c(NA,
6L), class = "data.frame", .Names = c("id", "fields.created",
"fields.customfield_10303", "fields.customfield_28100"))
I found a way to do this.
First, instead of changing the data, I added a column with mutate. Then, directly selected the same column from all nested lists. Then, I converted the list column into a vector. Finally, I cleaned it up by removing the other columns.
It seems to work. I don't know yet how it will handle multiple rows within the nested df.
# Take the second field of every nested data.frame (`value` in the sample
# data) and collapse it to one string per row. NULL entries become "", and a
# nested frame with several rows yields a comma-separated string.
# vapply() guarantees a plain character vector regardless of the input,
# which sapply() does not.
dat <- sample_dat %>%
  mutate(categories = vapply(
    nested_col,
    function(x) toString(x[[2]]),
    character(1)
  )) %>%
  select(-nested_col)
Related
How to directly select the same column from all nested lists within a list?
r-convert list column into character vector where lists are characters
library(dplyr)
library(tidyr)
# Toy data: one grouping column plus two value columns.
df <- tibble(
  Group = c("A", "A", "B", "C", "D", "D"),
  Batman = 1:6,
  Superman = c("red", "blue", "orange", "red", "blue", "red")
)
# Collapse everything except Group into a list column named `data` ...
nested <- nest(df, data = -Group)
# ... and expand that list column back into regular rows.
unnested <- unnest(nested, data)
Nesting and unnesting data with tidyr
library(purrr)
# Keep only the second column of each nested tibble before unnesting.
keep_second <- function(tbl) select(tbl, 2)
nested %>%
  mutate(data = map(data, keep_second)) %>%
  unnest(data)
select with purrr, but lapply as you've done is fine, it's just for aesthetics ;)
I'm trying to reshape my data based on the value in a particular column (i.e. "up" and "down"). The Up and Down rows are not in the same order in the data frame, so I'm having difficulty "casting" the data into the right shape.
I've tried using the cast function to shift the data, but I can't get the answers to work in a consistent (i.e. accurate) fashion.
This is my input:
input = structure(list(X = 1:6, Report = c("Sales.csv", "Sales.csv",
"Sales.csv", "Sales.csv", "Sales.csv", "Sales.csv"), Shock = c("Currencies.USD_Up",
"Currencies.USD_Down", "Currencies.AUD_Up", "Currencies.AUD_Down",
"Currencies.EUR_Down", "Currencies.EUR_Up"), Result = c(-519375.9816,
-7388851.423, -42950.77683, -667.367063, -12819532.15, -138054.0061
), FX = c("USD", "USD", "AUD", "AUD", "EUR", "EUR")), class = "data.frame", row.names = c(NA,
-6L))
and this is my preferred output:
output = structure(list(X = 1:3, Report = c("Sales.csv", "Sales.csv",
"Sales.csv"), Shock = c("Currencies.USD", "Currencies.AUD", "Currencies.EUR"
), Currency = c("USD", "AUD", "EUR"), Up = c(-519375.9816, -42950.77683,
-138054.0061), Down = c(-7388851.423, -667.367063, -12819532.15
)), class = "data.frame", row.names = c(NA, -3L))
Because the EUR data in the input is in a different order, I can't seem to make the data shape correctly. I've tried using the grep function to order this, but I can't make this work. Can anyone suggest a better way?
This is a tidyverse approach to do it:
library(dplyr)
library(tidyr)
library(tibble)
# Split "Currencies.USD_Up" into the shock name and its direction, then give
# each direction its own column. pivot_wider() replaces the superseded
# spread() and keeps the rows in order of first appearance (USD, AUD, EUR),
# which is the order of the preferred output.
input %>%
  as_tibble() %>%
  separate(Shock, c("Shock", "tmp"), sep = "_") %>%
  rename(Currency = FX) %>%
  # X is unique per input row and would stop Up/Down from sharing a row
  select(-X) %>%
  pivot_wider(names_from = tmp, values_from = Result) %>%
  mutate(X = row_number()) %>%
  select(X, Report, Shock, Currency, Up, Down)
I have a single column in R that looks like this:
Path Column
ag.1.4->ao.5.5->iv.9.12->ag.4.35
ao.11.234->iv.345.455.1.2->ag.9.531
I want to transform this into:
Path Column
ag->ao->iv->ag
ao->iv->ag
How can I do this?
Thank you
Here is my full dput from my data:
structure(list(Rank = c(10394749L, 36749879L), Count = c(1L,
1L), Percent = c(0.001011122, 0.001011122), Path = c("ao.legacy payment.not_completed->ao.legacy payment.not_completed->ao.legacy payment.completed",
"ao.legacy payment.not_completed->agent.payment.completed")), .Names = c("Rank",
"Count", "Percent", "Path"), class = "data.frame", row.names = c(NA,
-2L))
You could use gsub to match the . and numbers following the . (\\.[0-9]+) and replace it with ''.
df1$Path.Column <- gsub('\\.[0-9]+', '', df1$Path.Column)
df1
# Path.Column
#1 ag -> ao -> iv -> ag
#2 ao -> iv -> ag
Update
For the new dataset df2
# Drop everything from the first "." of each step up to the next "->" (or the
# end of the string). Unlike a [^->] character class, the tempered dot
# (?:(?!->).)* also copes with a lone "-" or ">" inside a step.
gsub('\\.(?:(?!->).)*', '', df2$Path, perl=TRUE)
#[1] "ao->ao->ao" "ao->agent"
and for the string shown in the OP's post
str2 <- c('ag.1.4->ao.5.5->iv.9.12->ag.4.35',
          'ao.11.234->iv.345.455.1.2->ag.9.531')
# Drop everything from the first "." of each step up to the next "->" (or the
# end of the string). Unlike a [^->] character class, the tempered dot
# (?:(?!->).)* also copes with a lone "-" or ">" inside a step.
gsub('\\.(?:(?!->).)*', '', str2, perl=TRUE)
#[1] "ag->ao->iv->ag" "ao->iv->ag"
data
df1 <- structure(list(Path.Column = c("ag.1 -> ao.5 -> iv.9 -> ag.4",
"ao.11 -> iv.345 -> ag.9")), .Names = "Path.Column",
class = "data.frame", row.names = c(NA, -2L))
df2 <- structure(list(Rank = c(10394749L, 36749879L), Count = c(1L,
1L), Percent = c(0.001011122, 0.001011122),
Path = c("ao.legacy payment.not_completed->ao.legacy payment.not_completed->ao.legacy payment.completed",
"ao.legacy payment.not_completed->agent.payment.completed")),
.Names = c("Rank", "Count", "Percent", "Path"), class = "data.frame",
row.names = c(NA, -2L))
It may be easier to split the strings on '->' and process the substrings separately:
# split the strings into parts (df2 is the question's data frame)
subStrings <- strsplit(df2$Path, '->', fixed = TRUE)
# remove everything after the **first** dot
subStrings <- lapply(subStrings,
                     function(x) gsub('\\..*', '', x))
# paste them back together; vapply() guarantees a character vector
vapply(subStrings, paste0, character(1), collapse = "->")
#> "ao->ao->ao" "ao->agent"
or
# split the strings into parts (df2 is the question's data frame)
subStrings <- strsplit(df2$Path, '->', fixed = TRUE)
# remove the part of each identifier after the dot, up to the next space/tab
subStrings <- lapply(subStrings,
                     function(x) gsub('\\.[^ \t]*', '', x))
# paste them back together; vapply() guarantees a character vector
vapply(subStrings, paste0, character(1), collapse = "->")
#> "ao payment->ao payment->ao payment" "ao payment->agent"