Summarize dataframe with start and end times in R?

Summarize dataframe with start and end times in R? - r

Here is a sample of my df:
structure(list(press_id = c(1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L),
start_time = c(164429106370978, 164429106370978, 164429411618824,
164429411618824, 164429837271939, 164429837271939, 164430399454284,
164430399454284), end_time = c(164429182443824, 164429182443824,
164429512525747, 164429512525747, 164429903243169, 164429903243169,
164430465927554, 164430465927554), timestamp = c(164429140697138,
164429175921880, 164429440899844, 164429440899844, 164429867184830,
164429891199391, 164430427558256, 164430433561155), acc_x = c(3.1053743,
2.9904492, 5.889916, 5.889916, 5.808511, 5.36557, 3.545921,
3.4788814), acc_y = c(8.406299, 8.12138, 8.600235, 8.600235,
7.920261, 7.922655, 7.9346266, 7.972935), acc_z = c(4.577853,
4.0894213, 0.35435268, 0.35435268, -0.21309046, 0.46927786,
4.005622, 4.4198313), grav_x = c(3.931084, 4.0214577, 4.7844357,
4.7844357, 5.6572776, 5.65053, 3.9938855, 3.9938855), grav_y = c(8.318872,
8.281514, 8.21449, 8.21449, 7.94851, 7.9495893, 8.027369,
8.027369), grav_z = c(3.393116, 3.3785365, 2.408623, 2.408623,
0.99327636, 1.0226398, 3.9724596, 3.9724596), gyro_x = c(-0.35906965,
0.099690154, 0.06792516, 0.04532315, -0.05546962, -0.06524346,
-0.2967614, -0.32180685), gyro_y = c(0.15843217, -0.48053285,
-0.2196934, -0.21175216, 0.1895863, 0.37467846, 0.12239113,
0.04847643), gyro_z = c(-0.042139318, 0.39585108, 0.12523776,
0.11240959, -0.05863268, 0.042770952, 0.047047008, 0.097137965
), acc_mag = c(10.0630984547559, 9.5719886173707, 10.4297995361418,
10.4297995361418, 9.82419166595324, 9.58008483176486, 9.56958006531909,
9.75731607717771), acc_mag_max = c(10.4656808698978, 10.4656808698978,
10.5978974240054, 10.5978974240054, 10.2717799984467, 10.2717799984467,
10.0054693945119, 10.0054693945119), acc_mag_min = c(9.55048847884876,
9.55048847884876, 9.45791784630329, 9.45791784630329, 9.58008483176486,
9.58008483176486, 9.49389444102469, 9.49389444102469), acc_mag_avg = c(9.9181794947982,
9.9181794947982, 9.82876220923978, 9.82876220923978, 9.89351246166363,
9.89351246166363, 9.77034322149792, 9.77034322149792), vel_ang_mag = c(0.394724572535758,
0.630514095219792, 0.261846355511019, 0.243985821544114,
0.206052505577139, 0.382714007838398, 0.324438496782347,
0.339625377757329), vel_ang_mag_max = c(0.665292823798622,
0.665292823798622, 1.00730683166191, 1.00730683166191, 0.561349818527019,
0.561349818527019, 0.445252333070234, 0.445252333070234),
vel_ang_mag_min = c(0.212944405199931, 0.212944405199931,
0.18680382123856, 0.18680382123856, 0.111795327479332, 0.111795327479332,
0.258342546774667, 0.258342546774667), vel_ang_mag_avg = c(0.440700089033948,
0.440700089033948, 0.405484992593493, 0.405484992593493,
0.284553957549617, 0.284553957549617, 0.348811700631375,
0.348811700631375)), .Names = c("press_id", "start_time",
"end_time", "timestamp", "acc_x", "acc_y", "acc_z", "grav_x",
"grav_y", "grav_z", "gyro_x", "gyro_y", "gyro_z", "acc_mag",
"acc_mag_max", "acc_mag_min", "acc_mag_avg", "vel_ang_mag", "vel_ang_mag_max",
"vel_ang_mag_min", "vel_ang_mag_avg"), row.names = c(NA, -8L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), vars = "press_id", drop = TRUE, indices = list(
0:1, 2:3, 4:5, 6:7), group_sizes = c(2L, 2L, 2L, 2L), biggest_group_size = 2L, labels = structure(list(
press_id = 1:4), row.names = c(NA, -4L), class = "data.frame", vars = "press_id", drop = TRUE, indices = list(
0:1, 2:3, 4:5, 6:7), group_sizes = c(2L, 2L, 2L, 2L), biggest_group_size = 2L, labels = structure(list(
press_id = 1:4), row.names = c(NA, -4L), class = "data.frame", vars = "press_id", drop = TRUE, .Names = "press_id"), .Names = "press_id"))
And I am trying to summarize it in the following way where the last columns(the blank are filled with their appropriate values from above dataframe):
press_id time_state time_state_val acc_mag acc_mag_max acc_mag_min acc_mag_avg vel_ang_mag vel_ang_mag_max vel_ang_mag_min vel_ang_mag_avg
1 start_time 164429106370978
1 end_time 164429182443824
2 start_time 164429411618824
2 end_time 164429512525747
3 start_time 164429837271939
3 end_time 164429903243169
4 start_time 164430399454284
4 end_time 164430427558256
Please advise how can I transform it to be like expected result.
I am trying to do this with combination of tidyr gather and dplyr but I don't get the structure I need.

library(dplyr)
library(tidyr)
df1 <- df[,1:6]
df1 %>% mutate(row=row_number()) %>%
gather(time_state , time_state_val, -press_id, -row,-timestamp:-acc_y) %>%
arrange(press_id, row) %>%
select(press_id, time_state, time_state_val, everything(),-row)

Related

Conditionally adding characters to new column based on separate dataset

Hello all and thank you in advance.
I would like to add a new column to my pre-existing data frame where the values sourced from a second data frame based on certain conditions. The dataset I wish to add the new column to ("data_melt") has many different sample IDs (sample.#) under the variable column. Using a second dataset ("metadata") I want to add the pond names to the "data_melt" new column based on the sample-ids. The sample IDs are the same in both datasets.
My gut tells me there's an obvious solution but my head is pretty fried. Here is a toy example of my data_melt df (since its 25,000 observations):
> dput(toy)
structure(list(gene = c("serA", "mdh", "fdhB", "fdhA"), process = structure(c(1L,
1L, 1L, 1L), .Label = "energy", class = "factor"), category = structure(c(1L,
1L, 1L, 1L), .Label = "metabolism", class = "factor"), ko = structure(1:4, .Label = c("K00058",
"K00093", "K00125", "K00148"), class = "factor"), variable = structure(c(1L,
2L, 3L, 3L), .Label = c("sample.10", "sample.19", "sample.72"
), class = "factor"), value = c(0.00116, 2.77e-05, 1.84e-05,
0.0125)), row.names = c(NA, -4L), class = "data.frame")
And here is a toy example of my metadata df:
> dput(toy)
structure(list(sample = c("sample.10", "sample.19", "sample.72",
"sample.13"), pond = structure(c(2L, 2L, 1L, 1L), .Label = c("lower",
"upper"), class = "factor")), row.names = c(NA, -4L), class = "data.frame")
Thank you again!

We can use match from base R to create a numeric index to replace the values
toy$pond <- with(toy, out$pond[match(variable, out$sample)])

I believe merge will work here.
sss <- structure(list(gene = c("serA", "mdh", "fdhB", "fdhA"), process = structure(c(1L,
1L, 1L, 1L), .Label = "energy", class = "factor"), category = structure(c(1L,
1L, 1L, 1L), .Label = "metabolism", class = "factor"), ko = structure(1:4, .Label = c("K00058",
"K00093", "K00125", "K00148"), class = "factor"), variable = structure(c(1L,
2L, 3L, 3L), .Label = c("sample.10", "sample.19", "sample.72"
), class = "factor"), value = c(0.00116, 2.77e-05, 1.84e-05,
0.0125)), row.names = c(NA, -4L), class = "data.frame")
ss <- structure(list(sample = c("sample.10", "sample.19", "sample.72",
"sample.13"), pond = structure(c(2L, 2L, 1L, 1L), .Label = c("lower",
"upper"), class = "factor")), row.names = c(NA, -4L), class = "data.frame")
ssss <- merge(sss, ss, by.x = "variable", by.y = "sample")

You can use left_join() from the dplyr package after renaming sample to variable in the metadata data frame.
library(tidyverse)
data_melt <- structure(list(gene = c("serA", "mdh", "fdhB", "fdhA"),
process = structure(c(1L, 1L, 1L, 1L),
.Label = "energy",
class = "factor"),
category = structure(c(1L, 1L, 1L, 1L),
.Label = "metabolism",
class = "factor"),
ko = structure(1:4,
.Label = c("K00058", "K00093", "K00125", "K00148"),
class = "factor"),
variable = structure(c(1L, 2L, 3L, 3L),
.Label = c("sample.10", "sample.19", "sample.72"),
class = "factor"),
value = c(0.00116, 2.77e-05, 1.84e-05, 0.0125)),
row.names = c(NA, -4L),
class = "data.frame")
metadata <- structure(list(sample = c("sample.10", "sample.19", "sample.72", "sample.13"),
pond = structure(c(2L, 2L, 1L, 1L),
.Label = c("lower", "upper"),
class = "factor")),
row.names = c(NA, -4L),
class = "data.frame") %>%
# Renaming the column, so we can join the two data sets together
rename(variable = sample)
data_melt <- data_melt %>%
left_join(metadata, by = "variable")

how to reshape the matrix and fill the missing value as 0

I have a question about matrix structure manipulation in R, here I need to first transpose the matrix and combine the month and status columns, filling the missing values with 0. Here I have an example, currently my data is like belows. It seems very tricky. I would appreciate if anyone could help on this. Thank you.
Hi, my data looks like the follows:
structure(list(Customer = c("1096261", "1096261", "1169502",
"1169502"), Phase = c("2", "3", "1", "2"), Status = c("Ontime",
"Ontime", "Ontime", "Ontime"), Amount = c(21216.32, 42432.65,
200320.05, 84509.24)), .Names = c("Customer", "Phase", "Status",
"Amount"), row.names = c(NA, -4L), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"), vars = c("Customer", "Phase"), drop = TRUE, indices
= list(
0L, 1L, 2L, 3L), group_sizes = c(1L, 1L, 1L, 1L), biggest_group_size = 1L,
labels = structure(list(
Customer = c("1096261", "1096261", "1169502", "1169502"),
Phase = c("2", "3", "1", "2")), row.names = c(NA, -4L), class =
"data.frame", vars = c("Customer",
"Phase"), drop = TRUE, .Names = c("Customer", "Phase")))
I need to have the reshaped matrix with the following columns:
Customer Phase1earlyTotal Phase2earlyTotal....Phase4earlyTotal...Phase1_ Ontimetotal...Phase4_Ontimetotal...Phase1LateTotal_Phase4LateTotal. For example Phase1earlytotal includes the sum of the amount with the Phase=1 and Status=Early.
Currently I use the following scripts, which does not work, coz I dont know
how to combine Phase and Stuatus Column.
mydata2<-data.table(mydata2,V3,V4)
mydata2$V4<-NULL
datacus <- data.frame(mydata2[-1,],stringsAsFactors = F);
datacus <- datacus %>% mutate(Phase= as.numeric(Phase),Amount=
as.numeric(Amount)) %>%
complete(Phase = 1:4,fill= list(Amount = 0)) %>%
dcast(datacus~V3, value.var = 'Amount',fill = 0) %>% select(Phase, V3)
%>%t()

I believe you are looking for somethink like this?
sample data
df <- structure(list(Customer = c("1096261", "1096261", "1169502",
"1169502"), Phase = c("2", "3", "1", "2"), Status = c("Ontime",
"Ontime", "Ontime", "Ontime"), Amount = c(21216.32, 42432.65,
200320.05, 84509.24)), .Names = c("Customer", "Phase", "Status",
"Amount"), row.names = c(NA, -4L), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"), vars = c("Customer", "Phase"), drop = TRUE, indices
= list(
0L, 1L, 2L, 3L), group_sizes = c(1L, 1L, 1L, 1L), biggest_group_size = 1L,
labels = structure(list(
Customer = c("1096261", "1096261", "1169502", "1169502"),
Phase = c("2", "3", "1", "2")), row.names = c(NA, -4L), class =
"data.frame", vars = c("Customer",
"Phase"), drop = TRUE, .Names = c("Customer", "Phase")))
# Customer Phase Status Amount
# 1: 1096261 2 Ontime 21216.32
# 2: 1096261 3 Ontime 42432.65
# 3: 1169502 1 Ontime 200320.05
# 4: 1169502 2 Ontime 84509.24
code
library( data.table )
dcast( setDT( df ), Customer ~ Phase + Status, fun = sum, value.var = "Amount" )[]
output
# Customer 1_Ontime 2_Ontime 3_Ontime
# 1: 1096261 0 21216.32 42432.65
# 2: 1169502 200320 84509.24 0.00

ggplot add aggregated summaries to a bar plot

I have the following data frame:
structure(list(StepsGroup = structure(c(1L, 1L, 1L, 2L, 2L, 2L,
3L, 3L, 3L), .Label = c("(-Inf,3e+03]", "(3e+03,1.2e+04]", "(1.2e+04, Inf]"
), class = "factor"), GlucoseGroup = structure(c(1L, 2L, 3L,
1L, 2L, 3L, 1L, 2L, 3L), .Label = c("<100", "100-180", ">180"
), class = "factor"), n = c(396L, 1600L, 229L, 787L, 4182L, 375L,
110L, 534L, 55L), freq = c(0.177977528089888, 0.719101123595506,
0.102921348314607, 0.147267964071856, 0.782559880239521, 0.0701721556886228,
0.157367668097282, 0.763948497854077, 0.0786838340486409)), class =
c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -9L), vars = "StepsGroup",
labels = structure(list(
StepsGroup = structure(1:3, .Label = c("(-Inf,3e+03]", "(3e+03,1.2e+04]",
"(1.2e+04, Inf]"), class = "factor")), class = "data.frame", row.names =
c(NA, -3L), vars = "StepsGroup", drop = TRUE), indices = list(0:2,
3:5, 6:8), drop = TRUE, group_sizes = c(3L, 3L, 3L), biggest_group_size =
3L)
I would like to create a stacked bar plot, and add a summary of each StepsGroup on top of each bar. So the first group will have 2225, the second 5344 and the third 699.
I am using the following script:
ggplot(d_stepsFastingSummary , aes(y = freq, x = StepsGroup, fill =
GlucoseGroup)) + geom_bar(stat = "identity") +
geom_text(aes(label = sum(n()), vjust = 0))
The part until before the geom_text works, but for the last bit I get the following error:
Error: This function should not be called directly
Any idea how to add the aggregated quantity?

We could create a new dataframe stacked_df which would have sum for each StepsGroup
stacked_df <- df %>% group_by(StepsGroup) %>% summarise(nsum = sum(n))
ggplot(df) +
geom_bar(aes(y = freq, x = StepsGroup, fill= GlucoseGroup),stat = "identity") +
geom_text(data = stacked_df, aes(label = nsum, StepsGroup,y = 1.1))

R Wide to long format for multiple variables with patterns [duplicate]

This question already has answers here:
Reshaping multiple sets of measurement columns (wide format) into single columns (long format)
(8 answers)
Closed 4 years ago.
I have a data set with a single identifier and five columns that repeat 18 times. I want to restructure the data into long format keeping the first five column headings as the column headings. Below is a sample with just two repeats:
structure(list(Response.ID = 1:2, Task = structure(c(1L, 1L), .Label = "task1", class = "factor"),
Freq = structure(c(1L, 1L), .Label = "Daily", class = "factor"),
Hours = c(3L, 2L), Value = c(10L, 8L), Mood = structure(1:2, .Label = c("Engaged",
"Neutral"), class = "factor"), Task.1 = structure(c(1L, 1L
), .Label = "task2", class = "factor"), Freq.1 = structure(c(1L,
1L), .Label = "Weekly", class = "factor"), Hours.1 = c(4L,
4L), Value.1 = c(10L, 6L), Mood.1 = structure(c(2L, 1L), .Label = c("Neutral",
"Optimistic"), class = "factor")), .Names = c("Response.ID", "Task", "Freq", "Hours", "Value", "Mood", "Task.1", "Freq.1", "Hours.1", "Value.1", "Mood.1"), class = "data.frame", row.names = c(NA, -2L))
I attempted using the melt and patterns functions, which appears to approximate my desired outcome without the desired column headings:
df = melt(df1, id.vars = c("Response.ID"), measure.vars = patterns("^Task", "^Freq","^Hours","^Mood"))
Here is the result:
structure(list(Response.ID = c(1L, 2L, 1L, 2L), variable = structure(c(1L, 1L, 2L, 2L), class = "factor", .Label = c("1", "2")), value1 = c("task1", "task1", "task2", "task2"), value2 = c("Daily", "Daily", "Weekly", "Weekly"), value3 = c(3L, 2L, 4L, 4L), value4 = c("Engaged", "Neutral", "Optimistic", "Neutral")), .Names = c("Response.ID", "variable", "value1", "value2", "value3", "value4"), row.names = c(NA, -4L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x0000000000330788>)
When I tried to specify names with value.name() below I receive an error:
df = melt(df1, id.vars = c("Response.ID"),measure.vars = patterns("^Task", "^Freq","^Hours","^Mood"), value.name=c("Task", "Freq", "Hours", "Value","Mood"))
My desired result would look like this:
structure(list(Response.ID = c(1L, 2L, 1L, 2L), Task = structure(c(1L, 1L, 2L, 2L), .Label = c("task1", "task2"), class = "factor"),
Freq = structure(c(1L, 1L, 2L, 2L), .Label = c("Daily", "Weekly"
), class = "factor"), Hours = c(3L, 2L, 4L, 4L), Value = c(10L,
8L, 10L, 6L), Mood = structure(c(1L, 2L, 3L, 2L), .Label = c("Engaged",
"Neutral", "Optimistic"), class = "factor")), .Names = c("Response.ID", "Task", "Freq", "Hours", "Value", "Mood"), class = "data.frame", row.names = c(NA, -4L))

It looks to me like you embarked on a difficult journey by using melt: this function is well named in the sense that trying to use it will probably melt your brain. Joke aside, the function melt has lots of underlying computations and its use could be inefficient if you have a large dataset.
I would instead solve the problem manually with rbindlist (from the excellent package data.table, which also ships with an optimized version of melt if you really want to use it), to manually concatenates groups of columns. This also preserves the column names:
> rbindlist(lapply(1:2, function(i) df1[,c(1,((i-1)*5+2):((i-1)*5+6))]))
Response.ID Task Freq Hours Value Mood
1: 1 task1 Daily 3 10 Engaged
2: 2 task1 Daily 2 8 Neutral
3: 1 task2 Weekly 4 10 Optimistic
4: 2 task2 Weekly 4 6 Neutral
This works on your example: replace the indices 1:2 by the number of repetitions to make it work with the real dataset (so, lapply(1:18)).

Collapse and aggregate several row values by date

I've got a data set that looks like this:
date, location, value, tally, score
2016-06-30T09:30Z, home, foo, 1,
2016-06-30T12:30Z, work, foo, 2,
2016-06-30T19:30Z, home, bar, , 5
I need to aggregate these rows together, to obtain a result such as:
date, location, value, tally, score
2016-06-30, [home, work], [foor, bar], 3, 5
There are several challenges for me:
The resulting row (a daily aggregate) must include the rows for this day (2016-06-30 in my above example
Some rows (strings) will result in an array containing all the values present on this day
Some others (ints) will result in a sum
I've had a look at dplyr, and if possible I'd like to do this in R.
Thanks for your help!
Edit:
Here's a dput of the data
structure(list(date = structure(1:3, .Label = c("2016-06-30T09:30Z",
"2016-06-30T12:30Z", "2016-06-30T19:30Z"), class = "factor"),
location = structure(c(1L, 2L, 1L), .Label = c("home", "work"
), class = "factor"), value = structure(c(2L, 2L, 1L), .Label = c("bar",
"foo"), class = "factor"), tally = c(1L, 2L, NA), score = c(NA,
NA, 5L)), .Names = c("date", "location", "value", "tally",
"score"), class = "data.frame", row.names = c(NA, -3L))

mydat<-structure(list(date = structure(1:3, .Label = c("2016-06-30T09:30Z",
"2016-06-30T12:30Z", "2016-06-30T19:30Z"), class = "factor"),
location = structure(c(1L, 2L, 1L), .Label = c("home", "work"
), class = "factor"), value = structure(c(2L, 2L, 1L), .Label = c("bar",
"foo"), class = "factor"), tally = c(1L, 2L, NA), score = c(NA,
NA, 5L)), .Names = c("date", "location", "value", "tally",
"score"), class = "data.frame", row.names = c(NA, -3L))
mydat$date <- as.Date(mydat$date)
require(data.table)
mydat.dt <- data.table(mydat)
mydat.dt <- mydat.dt[, lapply(.SD, paste0, collapse=" "), by = date]
cbind(mydat.dt, aggregate(mydat[,c("tally", "score")], by=list(mydat$date), FUN = sum, na.rm=T)[2:3])
which gives you:
date location value tally score
1: 2016-06-30 home work home foo foo bar 3 5
Note that if you wanted to you could probably do it all in one step in the reshaping of the data.table but I found this to be a quicker and easier way for me to achieve the same thing in 2 steps.

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Summarize dataframe with start and end times in R? - r

library(dplyr) library(tidyr) df1 <- df[,1:6] df1 %>% mutate(row=row_number()) %>% gather(time_state , time_state_val, -press_id, -row,-timestamp:-acc_y) %>% arrange(press_id, row) %>% select(press_id, time_state, time_state_val, everything(),-row)

Related

Conditionally adding characters to new column based on separate dataset

how to reshape the matrix and fill the missing value as 0

ggplot add aggregated summaries to a bar plot

R Wide to long format for multiple variables with patterns [duplicate]

Collapse and aggregate several row values by date

Categories

Resources