Related
I have a series of about 300 data frames, each structured the same way, and I want to write code that will turn each of them into its own bar graph. I am struggling to write code that structures the graph correctly in the first place. Here is an example of what my data frames look like:
precursorMz Mz_Round HW Intensity Reg Intensity diff1 diff2
1 256.6814 141.10 4216 3994 0.96 1.00
2 256.6814 142.10 7184 5988 1.00 1.02
3 256.6814 143.12 44510 30020 1.02 1.00
4 256.6814 144.12 1858 1312 1.00 0.00
5 256.6814 260.20 43010 23230 4.52 1.00
6 256.6814 261.20 9452 6388 1.00 0.99
I want my graph to have the Mz_Round column be the X axis and then my Y values be HW Intensity and Reg Intensity.
I have tried using the barplot() function but again am having issues with getting my axes to be correct.
# Attempted grouped bar chart of HW vs. Reg intensity for each Mz_Round value.
# NOTE(review): table() cross-tabulates the two columns, so `intensities`
# holds counts of (HW, Reg) value pairs rather than the intensities themselves.
intensities <- table(split1$`HW Intensity`, split1$`Reg Intensity`)
barplot(intensities,
main = "Intensity Compared",
xlab = "M/z", ylab = "Intensity",
col = c("darkgrey", "blue"),
# NOTE(review): unnamed argument, so it is matched positionally instead of to
# names.arg; rownames() of a plain column is also likely NULL -- confirm.
rownames(split1$Mz_Round),
beside = TRUE)
I have tried a couple of plots. I hope this helps.
# Data
> dput(df)
structure(list(precursor_Mz = c(256.6814, 256.6814, 256.6814,
256.6814, 256.6814, 256.6814), Mz_Round = c(141.1, 142.1, 143.12,
144.12, 260.2, 261.2), HW_Intensity = c(4216, 7184, 44510, 1858,
43010, 9452), Reg_Intensity = c(3994, 5988, 30020, 1312, 23230,
6388), diff1 = c(0.96, 1, 1.02, 1, 4.52, 1), diff2 = c(1, 1.02,
1, 0, 1, 0.99)), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -6L), spec = structure(list(cols = list(
precursor_Mz = structure(list(), class = c("collector_double",
"collector")), Mz_Round = structure(list(), class = c("collector_double",
"collector")), HW_Intensity = structure(list(), class = c("collector_double",
"collector")), Reg_Intensity = structure(list(), class = c("collector_double",
"collector")), diff1 = structure(list(), class = c("collector_double",
"collector")), diff2 = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))
library(tidyverse)
# Reshape to long format: keep the m/z column and stack the two intensity
# columns into `name` (which intensity) and `value` (its reading).
df1 <- pivot_longer(
  select(df, Mz_Round, HW_Intensity, Reg_Intensity),
  cols = -Mz_Round
)

# Stacked bars: one bar per m/z value, filled by intensity type.
ggplot(df1, aes(x = as.factor(Mz_Round), y = value, fill = name)) +
  geom_col()

# Dodged bars: the two intensity types side by side for each m/z value.
ggplot(df1, aes(x = as.factor(Mz_Round), y = value, fill = name)) +
  geom_col(position = position_dodge())
I have a function that filters a data.frame based on the unique values of a group column that is passed to the function
# Split `df` into one subset per unique value of the column `grp`.
#   df:  a data frame
#   grp: unquoted column name, forwarded with {{ }}
# Prints each group value and returns a list of the filtered data frames.
# NOTE(review): inside filter() the name `x` is looked up in `df` first, so a
# data column called `x` masks the lambda argument of the same name.
la <- function(df, grp){
gr <- df %>% pull({{grp}}) %>% unique()
purrr::map(gr, function(x){
print(x)
filter(df, {{grp}} == x)
})
}
When I use it with this df,
x <- structure(list(mac = c("dc:a6:32:21:59:2b", "dc:a6:32:2d:8c:ca",
"dc:a6:32:2d:b8:62", "dc:a6:32:2d:ca:3f"), datetime = structure(c(1594644546,
1594645457, 1594645375, 1594645080), tzone = "UTC", class = c("POSIXct",
"POSIXt")), Comment = c("FED2", "FED7", "FED1", "FED6")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -4L))
la(x, mac)
I get the proper prints and the subsets.
However, when I use it with this other df, which should be equivalent, it doesn't work as expected.
df <- structure(list(datetime = structure(c(1594644600, 1594644900,
1594645200, 1594645500, 1594645800, 1594646100), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), movement = c(9940.50454596681, 10779.7747307276,
7148.52826988968, 7687.54314683339, 8797.06954533588, 7524.02474093548
), x = c(606, NA, 240, NA, 504, NA), y = c(386, NA, 274, NA,
56, NA), i_x = c(606, 228, 214, 407.5, 500, 292.947368421053),
i_y = c(386, 286, 258, 49.1666666666667, 56, 234), mac = c("dc:a6:32:21:59:2b",
"dc:a6:32:21:59:2b", "dc:a6:32:21:59:2b", "dc:a6:32:21:59:2b",
"dc:a6:32:21:59:2b", "dc:a6:32:21:59:2b")), spec = structure(list(
cols = list(filename = structure(list(), class = c("collector_character",
"collector")), datetime = structure(list(format = ""), class = c("collector_datetime",
"collector")), movement = structure(list(), class = c("collector_double",
"collector")), x = structure(list(), class = c("collector_double",
"collector")), y = structure(list(), class = c("collector_double",
"collector")), i_x = structure(list(), class = c("collector_double",
"collector")), i_y = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), delim = "\t"), class = "col_spec"), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
I get 0 rows on each type of group (my real example has the same groups as the ones for the x dataframe).
Interestingly, this works as expected.
la(select(head(df), mac, datetime), mac)
[1] "dc:a6:32:21:59:2b"
[[1]]
# A tibble: 6 x 2
mac datetime
<chr> <dttm>
1 dc:a6:32:21:59:2b 2020-07-13 12:50:00
2 dc:a6:32:21:59:2b 2020-07-13 12:55:00
3 dc:a6:32:21:59:2b 2020-07-13 13:00:00
4 dc:a6:32:21:59:2b 2020-07-13 13:05:00
5 dc:a6:32:21:59:2b 2020-07-13 13:10:00
6 dc:a6:32:21:59:2b 2020-07-13 13:15:00
What is going on?
As the comment suggests, the problem is that I have function(x) inside the map call and because df has an x column, things become weird. I chose another variable name for that, and now it's working.
#' Split a data frame into one subset per group value
#'
#' @param df A data frame (or tibble).
#' @param grp Unquoted name of the grouping column, forwarded with `{{ }}`.
#' @return A list with one filtered data frame per unique value of `grp`, in
#'   order of first appearance. Each group value is printed as a side effect.
la <- function(df, grp){
  gr <- df %>% pull({{grp}}) %>% unique()
  purrr::map(gr, function(tt){
    print(tt)
    # `.env$tt` looks `tt` up in the calling environment only, so a column of
    # `df` that happens to be called `tt` can no longer mask the group value.
    filter(df, {{grp}} == .env$tt)
  })
}
I am trying to map over a list of data frames in R but not getting it right. What I am trying is:
# Add NewColumn1 (twice the value column) to every data frame in `lst`.
# NOTE(review): `.x$value` is lowercase, while the printed tibbles show a
# column named `Value`.
lst %>%
map(~mutate(., NewColumn1 = .x$value*2,))
With error:
Error: Column NewColumn1 must be length 2 (the number of rows) or
one, not 0 In addition: Warning message: Unknown or uninitialised
column: 'value'.
The data looks like:
[[9]]
# A tibble: 2 x 4
time ID Value out
<date> <chr> <dbl> <dbl>
1 2016-12-23 CAT1 790. 0
2 2016-12-27 CAT1 792. 1
[[10]]
# A tibble: 2 x 4
time ID Value out
<date> <chr> <dbl> <dbl>
1 2016-12-28 CAT1 785. 0
2 2016-12-29 CAT1 783. 0
DATA:
Data <- list(structure(list(time = structure(c(17136, 17137), class = "Date"),
ID = c("CAT1", "CAT1"), Value = c(747.919983, 750.5), out = c(0,
1)), row.names = c(NA, -2L), class = c("tbl_df", "tbl", "data.frame"
)), structure(list(time = structure(c(17140, 17141), class = "Date"),
ID = c("CAT1", "CAT1"), Value = c(762.52002, 759.109985),
out = c(1, 0)), row.names = c(NA, -2L), class = c("tbl_df",
"tbl", "data.frame")), structure(list(time = structure(c(17142,
17143), class = "Date"), ID = c("CAT1", "CAT1"), Value = c(771.190002,
776.419983), out = c(1, 1)), row.names = c(NA, -2L), class = c("tbl_df",
"tbl", "data.frame")), structure(list(time = structure(c(17144,
17147), class = "Date"), ID = c("CAT1", "CAT1"), Value = c(789.289978,
789.27002), out = c(1, 1)), row.names = c(NA, -2L), class = c("tbl_df",
"tbl", "data.frame")), structure(list(time = structure(c(17148,
17149), class = "Date"), ID = c("CAT1", "CAT1"), Value = c(796.099976,
797.070007), out = c(1, 0)), row.names = c(NA, -2L), class = c("tbl_df",
"tbl", "data.frame")), structure(list(time = structure(c(17150,
17151), class = "Date"), ID = c("CAT1", "CAT1"), Value = c(797.849976,
790.799988), out = c(1, 0)), row.names = c(NA, -2L), class = c("tbl_df",
"tbl", "data.frame")), structure(list(time = structure(c(17154,
17155), class = "Date"), ID = c("CAT1", "CAT1"), Value = c(794.200012,
796.419983), out = c(1, 0)), row.names = c(NA, -2L), class = c("tbl_df",
"tbl", "data.frame")), structure(list(time = structure(c(17156,
17157), class = "Date"), ID = c("CAT1", "CAT1"), Value = c(794.559998,
791.26001), out = c(0, 0)), row.names = c(NA, -2L), class = c("tbl_df",
"tbl", "data.frame")), structure(list(time = structure(c(17158,
17162), class = "Date"), ID = c("CAT1", "CAT1"), Value = c(789.909973,
791.549988), out = c(0, 1)), row.names = c(NA, -2L), class = c("tbl_df",
"tbl", "data.frame")), structure(list(time = structure(c(17163,
17164), class = "Date"), ID = c("CAT1", "CAT1"), Value = c(785.049988,
782.789978), out = c(0, 0)), row.names = c(NA, -2L), class = c("tbl_df",
"tbl", "data.frame")))
Take a look at the error message Unknown or uninitialised column: 'value'., then look at your code map(Data, ~mutate(., NewColumn1 = .x$value*2,)). The column name is Value and not value (case is important!).
Your syntax can also be cleaned up a bit. Try map(Data, ~mutate(., NewColumn1 = Value*2)). Technically, I think . and .x refer to the same thing, but it's better to be consistent. In mutate you also don't need to subset the data frame, i.e. mutate(df, new_col = old_col) is enough, you don't need mutate(df, new_col = .$old_col).
So I'm running a package in which the output of the function I'm using is something similar to this:
area ID structure
1 150 1 house
I have several of these which I get by looping through some stuff. Basically this is my loop function:
# For every model/patient pair, print the stored result and try to row-bind it
# into tableData[[l]][[k]].
# NOTE(review): result[[l]][[k]] prints as a single data frame, so
# do.call(rbind, ...) would receive its columns as separate arguments rather
# than a list of data frames -- confirm.
# NOTE(review): 1:length(x) iterates over c(1, 0) when x is empty;
# seq_along(x) would avoid that.
for (k in 1:length(models)) {
for (l in 1:length(patients)) {
print(result[[l]][[k]])
tableData[[l]][[k]] <- do.call(rbind, result[[l]][[k]])
}
}
So print(result[[l]][[k]]) gives the output I showed at the beginning. My issue is putting all of these into one data frame. So far it just doesn't work, even with do.call, which I have read is the function to use when combining lists into data frames.
So where am I going wrong here ?
Updated:
dput() output (area = value in this case):
list(list(structure(list(value = 0.0394797760472196, ID = "1 house",
structure = "house", model = structure(1L, .Label = "wood", class = "factor")), .Names = c("value",
"ID", "structure", "model"), row.names = c(NA, -1L), class = "data.frame"),
structure(list(value = 0.0394797760472196, ID = "1 house",
structure = "house", model = structure(1L, .Label = "stone", class = "factor")), .Names = c("value",
"ID", "structure", "model"), row.names = c(NA, -1L), class = "data.frame")),
list(structure(list(value = 0.0306923865158472, ID = "2 house",
structure = "house", model = structure(1L, .Label = "wood", class = "factor")), .Names = c("value",
"ID", "structure", "model"), row.names = c(NA, -1L), class = "data.frame"),
structure(list(value = 0.0306923865158472, ID = "2 house",
structure = "house", model = structure(1L, .Label = "stone", class = "factor")), .Names = c("value",
"ID", "structure", "model"), row.names = c(NA, -1L
), class = "data.frame")))
list(list(structure(list(value = 0.0394797760472196, ID = "1 house",
structure = "house", model = structure(1L, .Label = "wood", class = "factor")), .Names = c("value",
"ID", "structure", "model"), row.names = c(NA, -1L), class = "data.frame"),
structure(list(value = 0.0394797760472196, ID = "1 house",
structure = "house", model = structure(1L, .Label = "stone", class = "factor")), .Names = c("value",
"ID", "structure", "model"), row.names = c(NA, -1L), class = "data.frame")),
list(structure(list(value = 0.0306923865158472, ID = "2 house",
structure = "house", model = structure(1L, .Label = "wood", class = "factor")), .Names = c("value",
"ID", "structure", "model"), row.names = c(NA, -1L), class = "data.frame"),
structure(list(value = 0.0306923865158472, ID = "2 house",
structure = "house", model = structure(1L, .Label = "stone", class = "factor")), .Names = c("value",
"ID", "structure", "model"), row.names = c(NA, -1L
), class = "data.frame")))
Edit: I initially used purrr::map_dfr to solve this problem, but purrr::reduce is much more appropriate.
The list nesting means we have to bind rows together twice. Here's a solution using the purrr and dplyr packages and assigning your dput list to the variable my_list:
library(purrr)
library(dplyr)
# Fold the outer list with bind_rows(): each inner element is itself a list of
# data frames, which bind_rows() stacks row-wise onto the accumulated result.
my_df <- my_list %>%
  reduce(bind_rows)
#> Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
#> Warning in bind_rows_(x, .id): binding character and factor vector,
#> coercing into character vector
#> Warning in bind_rows_(x, .id): binding character and factor vector,
#> coercing into character vector
#> Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
#> Warning in bind_rows_(x, .id): binding character and factor vector,
#> coercing into character vector
#> Warning in bind_rows_(x, .id): binding character and factor vector,
#> coercing into character vector
my_df
#> value ID structure model
#> 1 0.03947978 1 house house wood
#> 2 0.03947978 1 house house stone
#> 3 0.03069239 2 house house wood
#> 4 0.03069239 2 house house stone
I find map-ing with purrr way more intuitive than do.call. Let me know if this helps!
I want to find customers who have bought exactly the same products.
The data I have describes customer behavior: what each customer has bought.
The example that I provided is a simplified version of my data. Customers will usually buy 10 to 20 products. There are around 50 products that consumers could choose to buy.
I am not sure what an easy way to transform my data into my preferred output would be.
Could you please give me any advice? Thanks
Input:
structure(list(Customer_ID = 1:6, Products = c("Apple, Beer, Diaper",
"Beer, Apple", "Beer, Apple, Diaper, Diaper", "Apple, Diaper",
"Diaper, Apple", "Apple, Diaper, Beer, Beer")), .Names = c("Customer_ID",
"Products"), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-6L), spec = structure(list(cols = structure(list(Customer_ID = structure(list(), class = c("collector_integer",
"collector")), Products = structure(list(), class = c("collector_character",
"collector"))), .Names = c("Customer_ID", "Products")), default = structure(list(), class = c("collector_guess",
"collector"))), .Names = c("cols", "default"), class = "col_spec"))
Output:
structure(list(`Products Bought` = c("Apple, Beer, Diaper", "Apple, Diaper"
), Customer_ID = c("1, 3, 6", "4, 5")), .Names = c("Products Bought",
"Customer_ID"), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-2L), spec = structure(list(cols = structure(list(`Products Bought` = structure(list(), class = c("collector_character",
"collector")), Customer_ID = structure(list(), class = c("collector_character",
"collector"))), .Names = c("Products Bought", "Customer_ID")),
default = structure(list(), class = c("collector_guess",
"collector"))), .Names = c("cols", "default"), class = "col_spec"))
I suspect that you may want to look at structuring your data in a way that is more usable. In any case, the tidyverse can be a helpful way of thinking through your task.
As mentioned, posting code for others to start with can save them time and get you an answer faster.
library(dplyr)
library(stringr)
library(tidyr)
# Example purchases: one row per customer, products as a comma-separated string.
# tibble() replaces the deprecated data_frame() constructor.
d <- tibble(
  id = c(1, 2, 3, 4, 5, 6),
  bought = c(
    "Apple, Beer, Diaper", "Apple, Beer", "Apple, Beer, Diaper, Diaper",
    "Apple, Diaper", "Diaper, Apple", "Apple, Diaper, Beer, Beer"
  )
)

# Group customers by the exact set of products they bought.
d %>%
  ## Unnest the values & take care of white space
  ## - This is the better data structure to have, anyways
  mutate(buy = str_split(bought, ",")) %>%
  unnest(buy) %>%
  mutate(buy = str_trim(buy)) %>%
  select(-bought) %>%
  ## Drop repeated purchases and sort, so that product order and duplicates
  ## in the input string do not create distinct baskets
  distinct(id, buy) %>%
  arrange(id, buy) %>%
  ## Aggregate by id: one canonical product string per customer
  group_by(id) %>%
  summarize(bought = paste(buy, collapse = ", ")) %>%
  ungroup() %>%
  ## Aggregate by basket: list the customers sharing each product string
  group_by(bought) %>%
  summarize(ids = paste(id, collapse = ",")) %>%
  ungroup()
EDIT: referencing this SO post for getting distinct combinations faster / cleaner in dplyr
Using the given input data and data.table, this can be written as a (rather convoluted) "one-liner":
# Reading the expression from the inside out:
#   1. setDT(input)[, strsplit(Products, ", "), Customer_ID] splits each
#      customer's product string into one row per product (column V1).
#   2. unique(...)[order(Customer_ID, V1)] drops repeated products and sorts
#      them within each customer.
#   3. dcast(..., Customer_ID ~ ., paste, collapse = ", ") pastes the sorted
#      products back into one string per customer, stored in a column named `.`.
#   4. The final [ ... ] groups by that product string (renamed Products) and
#      pastes the matching Customer_IDs into Customers.
dcast(unique(setDT(input)[, strsplit(Products, ", "), Customer_ID])[
order(Customer_ID, V1)],
Customer_ID ~ ., paste, collapse = ", ")[
, .(Customers = paste(Customer_ID, collapse = ", ")), .(Products = .)]
# Products Customers
#1: Apple, Beer, Diaper 1, 3, 6
#2: Apple, Beer 2
#3: Apple, Diaper 4, 5
Note that the OP has dropped the second line with only one customer from
the expected output but hasn't mentioned any criteria for filtering the output in the question.
Input data
(As given by OP):
input <- structure(list(Customer_ID = 1:6, Products = c("Apple, Beer, Diaper",
"Beer, Apple", "Beer, Apple, Diaper, Diaper", "Apple, Diaper",
"Diaper, Apple", "Apple, Diaper, Beer, Beer")), .Names = c("Customer_ID",
"Products"), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-6L), spec = structure(list(cols = structure(list(Customer_ID = structure(list(), class = c("collector_integer",
"collector")), Products = structure(list(), class = c("collector_character",
"collector"))), .Names = c("Customer_ID", "Products")), default = structure(list(), class = c("collector_guess",
"collector"))), .Names = c("cols", "default"), class = "col_spec"))