I have timeseries data from 2 sensors recording independently. They were started at different times and record data at different intervals: sensor 1 records every second while sensor 2 records every 2 seconds. I want to combine both datasets into a single data frame in order to plot them with ggplot. Can someone help me out? If there are better options than ggplot and data frames, please let me know. Thanks for the help. I included sample data below (not the actual data; let me know if I did not include the right sample):
dput(reading1)
structure(list(time = structure(c(-2209030842, -2209030841, -2209030840,
-2209030839, -2209030838, -2209030837, -2209030836, -2209030835,
-2209030834, -2209030833, -2209030832, -2209030831, -2209030830,
-2209030829, -2209030828, -2209030827, -2209030826, -2209030825,
-2209030824), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
`reading 1` = c(0.004, 0.003, 0.003, 0.013, 0.021, 0.008,
0.004, 0.005, 0.004, 0.007, 0.003, 0.004, 0.002, 0.003, 0.004,
0.004, 0.005, 0.001, 0.003)), class = c("tbl_df", "tbl",
"data.frame"), row.names = c(NA, -19L))
dput(reading2)
structure(list(Date = structure(c(-2209031012, -2209031009, -2209031007,
-2209031005, -2209031003, -2209030982, -2209030981, -2209030976,
-2209030974, -2209030972, -2209030970, -2209030949, -2209030882,
-2209030879, -2209030877, -2209030875, -2209030873, -2209030871,
-2209030850, -2209030849, -2209030838, -2209030816, -2209030814,
-2209030811, -2209030808, -2209030806, -2209030804, -2209030783,
-2209030782, -2209030780, -2209030778, -2209030775, -2209030773,
-2209030771, -2209030750, -2209030749, -2209030747, -2209030742,
-2209030740, -2209030738, -2209030717, -2209030705, -2209030684,
-2209030683, -2209030681, -2209030679, -2209030676, -2209030674,
-2209030672, -2209030651, -2209030650, -2209030648, -2209030646,
-2209030644, -2209030641, -2209030639), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), `reading 2` = c(8, 8, 8, 8, 8, 6,
6, 8, 8, 8, 8, 6, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6,
5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6)), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -56L))
You could combine them using dplyr like this:
# dplyr supplies %>%, bind_rows(), rename(), mutate() and arrange() used below.
library(dplyr)

# Stack both sensors into one long table: give the value and time columns a
# common name, tag every row with the sensor it came from, then order by time.
combined <- bind_rows(
  reading1 %>%
    rename(reading = `reading 1`) %>%
    mutate(sensor = 1),
  reading2 %>%
    rename(reading = `reading 2`, time = Date) %>%
    mutate(sensor = 2)
) %>%
  arrange(time)
combined
#> # A tibble: 75 x 3
#> time reading sensor
#> <dttm> <dbl> <dbl>
#> 1 1899-12-31 12:16:28 8 2
#> 2 1899-12-31 12:16:31 8 2
#> 3 1899-12-31 12:16:33 8 2
#> 4 1899-12-31 12:16:35 8 2
#> 5 1899-12-31 12:16:37 8 2
#> 6 1899-12-31 12:16:58 6 2
#> 7 1899-12-31 12:16:59 6 2
#> 8 1899-12-31 12:17:04 8 2
#> 9 1899-12-31 12:17:06 8 2
#> 10 1899-12-31 12:17:08 8 2
#> # ... with 65 more rows
Having your data in long format like this allows for easier plotting, for example:
library(ggplot2)
# One line per sensor; the sensor id is mapped to colour as a discrete factor.
ggplot(combined, aes(time, reading, color = factor(sensor))) +
  # `linewidth` replaces the deprecated `size` for line geoms (ggplot2 >= 3.4.0).
  geom_line(linewidth = 1) +
  theme_bw(base_size = 16) +
  scale_color_brewer(palette = "Set1", name = "Sensor")
Created on 2022-05-19 by the reprex package (v2.0.1)
Related
I have a list named df which contains three iterations with two years of projection.
What I want is: weighting the variable "district" just for year 2 in each iteration and finally I want to have mean of each weighted district for all three iterations. Note that each year has a variable named "weight" that weighting should be based on this variable.
# Build the data frame for one projected year from its four columns.
new_year <- function(age, district, gender, weight) {
  data.frame(age = age, district = district, gender = gender, weight = weight)
}

# Each iteration holds two projected years (year1, year2).
iteration1 <- list(
  year1 = new_year(
    age = c(10, 11, 12, 13),
    district = c(1, 2, 3, 4),
    gender = c(1, 2, 2, 1),
    weight = c(12.2, 11.3, 11.2, 10.1)
  ),
  year2 = new_year(
    age = c(10, 11, 12, 13, 10, 10),
    district = c(1, 2, 3, 4, 2, 1),
    gender = c(1, 2, 2, 1, 1, 1),
    weight = c(12.2, 11.3, 11.2, 10.1, 12.2, 13.1)
  )
)

iteration2 <- list(
  year1 = new_year(
    age = c(10, 11, 12, 13),
    district = c(1, 2, 3, 4),
    gender = c(2, 2, 1, 1),
    weight = c(12.2, 11.3, 11.2, 10.1)
  ),
  year2 = new_year(
    age = c(10, 11, 12, 13, 13, 13, 12),
    district = c(1, 2, 3, 4, 1, 3, 3),
    gender = c(2, 2, 1, 1, 2, 2, 2),
    weight = c(12.2, 11.3, 11.2, 10.1, 10.9, 11.9, 15.1)
  )
)

iteration3 <- list(
  year1 = new_year(
    age = c(10, 11, 12, 13),
    district = c(1, 2, 3, 4),
    gender = c(2, 2, 1, 1),
    weight = c(12.2, 11.3, 11.2, 10.1)
  ),
  year2 = new_year(
    age = c(10, 11, 12, 13, 10, 10, 11, 12),
    district = c(1, 2, 3, 4, 4, 3, 2, 2),
    gender = c(2, 2, 1, 1, 2, 2, 1, 2),
    weight = c(12.2, 11.3, 11.2, 10.1, 13.5, 12.8, 13.9, 14.9)
  )
)

# Named list of all iterations.
df <- list(iteration1 = iteration1, iteration2 = iteration2, iteration3 = iteration3)
Expected output:
district mean of each district for all three iterations
1 20.2
2 24.96
3 24.46
4 14.6
To calculate my expected output I followed two steps. In the first step, I weighted year 2 in each iteration with wtd.table(df$iteration1$year2$district, weights = df$iteration1$year2$weight). I repeated this code three times (because I have three iterations). Here is my output:
1 2 3 4
25.3 23.5 11.2 10.1
1 2 3 4
23.1 11.3 38.2 10.1
1 2 3 4
12.2 40.1 24.0 23.6
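In the second step, I calculated the mean of each district across the three iterations manually, e.g. mean(c(25.3, 23.1, 12.2)) for district 1. (The values must be wrapped in c(); mean(25.3, 23.1, 12.2) would return only the first value.)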
data.table approach
library(data.table)
library(questionr)
# Weighted district counts for year2, one data.table per iteration.
year2_counts <- lapply(df, function(iter) {
  yr2 <- iter[["year2"]]
  as.data.table(questionr::wtd.table(yr2$district, weights = yr2$weight))
})
# Stack the per-iteration tables into one long table.
ans <- rbindlist(year2_counts, use.names = TRUE, fill = TRUE)
# Summarise
ans[, .(weight = mean(N, na.rm = TRUE)), by = .(district = V1)]
# district weight
# 1: 1 20.20000
# 2: 2 24.96667
# 3: 3 24.46667
# 4: 4 14.60000
Version 2
With updated columns based on TS's comment below
# Weighted district-by-gender cross-tabulation of year2, one table per iteration.
year2_crosstabs <- lapply(df, function(iter) {
  yr2 <- iter[["year2"]]
  as.data.table(
    questionr::wtd.table(x = yr2$district, y = yr2$gender, weights = yr2$weight)
  )
})
# Stack the per-iteration tables into one long table.
ans <- rbindlist(year2_crosstabs, use.names = TRUE, fill = TRUE)
# Summarise
ans[,
    .(n = .N, mean = mean(N, na.rm = TRUE), sd = sd(N, na.rm = TRUE)),
    by = .(district = V1, gender = V2)]
# district gender n mean sd
# 1: 1 1 3 8.433333 14.606962
# 2: 2 1 3 8.700000 7.582216
# 3: 3 1 3 7.466667 6.466323
# 4: 4 1 3 10.100000 0.000000
# 5: 1 2 3 11.766667 11.556095
# 6: 2 2 3 16.266667 8.602519
# 7: 3 2 3 17.000000 8.697126
# 8: 4 2 3 4.500000 7.794229
Combine the list of dataframes into one and calculate average weight using questionr::wtd.table for each district and iteration in year2. Finally, get aggregated mean for each district.
Using tidyverse you can do -
library(dplyr)
library(purrr)
# Stack every year of every iteration into one long table, tagged by iteration
# and year.
all_years <- map_df(df, function(iter) bind_rows(iter, .id = 'year'), .id = 'iter')
year2_only <- filter(all_years, year == 'year2')
# Weighted count per district within each iteration.
per_iteration <- summarise(
  group_by(year2_only, district, iter),
  result = questionr::wtd.table(district, weights = weight)
)
# Average over iterations (still grouped by district after the first summarise).
summarise(per_iteration, result = mean(result))
# district result
# <dbl> <dbl>
#1 1 20.2
#2 2 25.0
#3 3 24.5
#4 4 14.6
I am trying to sort my data in descending or ascending order regardless of the data in the rows. I made a dummy example below:
# Dummy data: three numeric columns, two of them with trailing NAs.
A <- c(9, 9, 5, 4, 6, 3, 2, NA)
B <- c(9, 5, 3, 4, 1, 4, NA, NA)
C <- c(1, 4, 5, 6, 7, 4, 2, 4)
base <- data.frame(A, B, C)
df <- base
# Sort each column on its own. na.last = TRUE keeps the NAs (at the end) so the
# column length is unchanged; TRUE is spelled out because T can be reassigned.
df$A <- sort(df$A, na.last = TRUE)
df$B <- sort(df$B, na.last = TRUE)
df$C <- sort(df$C)
We get this
structure(list(A = c(2, 3, 3, 4, 4, 4, 5, 5, 6, 9, 9, NA), B = c(1,
2, 3, 4, 4, 4, 5, 5, 9, 10, NA, NA), C = c(1, 2, 3, 4, 4, 4,
5, 5, 6, 7, 8, 8)), row.names = c(NA, -12L), class = "data.frame")
I want to get something similar to df but my data have hundreds of columns, is there an easier way to do it?
I tried arrange_all() but the result is not what i want.
library(tidyverse)
# Orders whole rows by every column in turn; rows stay intact, so this is not
# an independent per-column sort.
test <- arrange_all(base)
Obtaining this:
structure(list(A = c(2, 3, 3, 4, 4, 4, 5, 5, 6, 9, 9, NA), B = c(NA,
2, 4, 4, 5, 10, 3, 4, 1, 5, 9, NA), C = c(2, 3, 4, 6, 8, 5, 5,
8, 7, 4, 1, 4)), class = "data.frame", row.names = c(NA, -12L
))
You can sort each column individually :
library(dplyr)
# Sort every column independently, keeping NAs at the end. `.cols` is given
# explicitly and the extra argument goes through a lambda, because omitting
# `.cols` and passing arguments via `...` are both deprecated in across().
base %>% mutate(across(everything(), ~ sort(.x, na.last = TRUE)))
# A B C
#1 2 1 1
#2 3 3 2
#3 4 4 4
#4 5 4 4
#5 6 5 4
#6 9 9 5
#7 9 NA 6
#8 NA NA 7
Or in base R :
# Sort each column on its own with NAs last; assigning into `base[]` keeps the
# data.frame structure instead of replacing it with a plain list.
base[] <- lapply(base, function(column) sort(column, na.last = TRUE))
I have the following data frame:
structure(list(C1 = c(1, 2, 2, 3, 4, 5, 5, 6), C2 = c(3.5, 3,
2.5, 2, 3, 2, 3, 5), C3 = c(6.5, 8, 9, 5, 7, 4, 3, 6)), row.names = c(NA,
-8L), class = c("tbl_df", "tbl", "data.frame"))
The first column is an index. The first observation is characterised by 1 point, the second by 2 points.
I need to make the intersection of all combinations of observations, one way. The result creates a new dataframe with a new index, with again some observations that are characterised by 2 rows/points: 1-2, 1-3, 1-4, 1-5, 1-6, 2-3, 2-4, 2-5, 2-6, 3-4, 3-5, 3-6, 4-5, 4-6, 5-6:
df2 = structure(list(C1 = c(1, 2, 3, 4, 4, 5, 6,7, 8, 8, 9, 10, 11, 11, 12, 13, 13, 14, 15, 15), C2 = c(3,2,3,3,2,3.5,2,3,2,3,3,2,3,2,2,2,3,3,2,3), C3 = c(6.5,5,6.5,3,4,6,5,7,4,3,6,5,3,4,5,4,3,6,4,3)), row.names = c(NA,
-20L), class = c("tbl_df", "tbl", "data.frame"))
where 3 in the first column is the new observation created by intersecting the 2 former.
I thought I could use pmin on each row but it does not work. Can someone tackle this?
I am not sure if the code is the thing you want, where cummin() is used
# Running (cumulative) minimum of every column except the first one.
running_min <- cummin(df[-1])
# Put the untouched first column back in front of the running minima.
df2 <- cbind(df[1], running_min)
> df2
C1 C2 C3
1 1 3.5 6.5
2 2 3.0 6.5
3 2 2.5 6.5
4 3 2.0 5.0
5 4 2.0 5.0
6 5 2.0 4.0
7 5 2.0 3.0
8 6 2.0 3.0
DATA
df <- structure(list(C1 = c(1, 2, 2, 3, 4, 5, 5, 6), C2 = c(3.5, 3,
2.5, 2, 3, 2, 3, 5), C3 = c(6.5, 8, 9, 5, 7, 4, 3, 6)), row.names = c(NA,
-8L), class = c("tbl_df", "tbl", "data.frame"))
This question already has answers here:
How collect additional row data on binned data in R
(1 answer)
Group value in range r
(3 answers)
Closed 3 years ago.
I am doing a statistical analysis on a big data frame (more than 48,000,000 rows) in R. Here is an example of the data:
structure(list(herd = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3), cows = c(1, 2,
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1, 2, 3, 4,
5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1, 2, 3, 4, 5, 6,
7, 8, 9, 10, 11, 12, 13, 14, 15, 16), `date` = c("11/03/2013",
"12/03/2013", "13/03/2013", "14/03/2013", "15/03/2013", "16/03/2013",
"13/05/2012", "14/05/2012", "15/05/2012", "16/05/2012", "17/05/2012",
"18/05/2012", "10/07/2016", "11/07/2016", "12/07/2016", "13/07/2016",
"11/03/2013", "12/03/2013", "13/03/2013", "14/03/2013", "15/03/2013",
"16/03/2013", "13/05/2012", "14/05/2012", "15/05/2012", "16/05/2012",
"17/05/2012", "18/05/2012", "10/07/2016", "11/07/2016", "12/07/2016",
"13/07/2016", "11/03/2013", "12/03/2013", "13/03/2013", "14/03/2013",
"15/03/2013", "16/03/2013", "13/05/2012", "14/05/2012", "15/05/2012",
"16/05/2012", "17/05/2012", "18/05/2012", "10/07/2016", "11/07/2016",
"12/07/2016", "13/07/2016"), glicose = c(240666, 23457789, 45688688,
679, 76564, 6574553, 78654, 546432, 76455643, 6876, 7645432,
876875, 98654, 453437, 98676, 9887554, 76543, 9775643, 986545,
240666, 23457789, 45688688, 679, 76564, 6574553, 78654, 546432,
76455643, 6876, 7645432, 876875, 98654, 453437, 98676, 9887554,
76543, 9775643, 986545, 240666, 23457789, 45688688, 679, 76564,
6574553, 78654, 546432, 76455643, 6876)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -48L))
I need to identify how many cows are in the following category of glicose by herd and by date:
<=100000
100000 and <=150000
150000 and <=200000
200000 and <=250000
250000 and <=400000
>400000
I tried to use the functions filter() and select() but could not categorize the variable like that.
I tried either to make a vector for each category but it did not work:
# Group by herd and date, then keep only the rows with glicose below 100000.
by_herd_date <- group_by(df, herd, date)
ht <- filter(by_herd_date, glicose < 100000)
Actually I do not have a clue of how I could do this. Please help!
I expect to get the number of cows in each category of each herd based on each date in a table like this:
Calling your data df,
# Upper bounds of the glicose categories: 1e5, 1.5e5, 2e5, 2.5e5, 4e5, then
# everything above. cut() builds right-closed intervals (a, b] by default.
glicose_breaks <- c(0, seq(1e5, 2.5e5, by = 0.5e5), 4e5, Inf)
df %>%
  mutate(
    glicose_group = cut(glicose, breaks = glicose_breaks),
    # Parse the day/month/year strings into real Date values.
    date = as.Date(date, format = "%d/%m/%Y")
  ) %>%
  group_by(herd, date, glicose_group) %>%
  # Number of rows (cows) in each herd / date / category combination.
  count()
# # A tibble: 48 x 4
# # Groups: herd, date, glicose_group [48]
# herd date glicose_group n
# <dbl> <date> <fct> <int>
# 1 1 2012-05-13 (0,1e+05] 1
# 2 1 2012-05-14 (4e+05,Inf] 1
# 3 1 2012-05-15 (4e+05,Inf] 1
# 4 1 2012-05-16 (0,1e+05] 1
# 5 1 2012-05-17 (4e+05,Inf] 1
# 6 1 2012-05-18 (4e+05,Inf] 1
# 7 1 2013-03-11 (2e+05,2.5e+05] 1
# 8 1 2013-03-12 (4e+05,Inf] 1
# 9 1 2013-03-13 (4e+05,Inf] 1
# 10 1 2013-03-14 (0,1e+05] 1
# # ... with 38 more rows
I also threw in a conversion to Date class, which is probably a good idea.
I have imported a .sav file with haven, but I am stuck: I can't work out how to print the label names in place of, or together with, the label codes. Labels: 1 = unemployed, 2 = looking, etc.
# Summary statistics of Gender for each EmploymentStatus code.
# The original call had a stray `<` before group_by() (a syntax error), used the
# deprecated funs(), and passed `is.na = TRUE`, which min()/max() treat as an
# extra value (1) rather than as a request to drop NAs; `na.rm = TRUE` is meant.
Employment <- well_being_df %>%
  select(EmploymentStatus, Gender) %>%
  group_by(EmploymentStatus) %>%
  summarise(
    mean = mean(Gender),
    n = n(),
    sd = sd(Gender),
    min = min(Gender, na.rm = TRUE),
    max = max(Gender, na.rm = TRUE)
  )
# A tibble: 5 x 6
EmploymentStatus mean n sd min max
<dbl+lbl> <dbl> <int> <dbl> <dbl> <dbl>
1 1 1.67 12 0.492 1 2
2 2 1.17 6 0.408 1 2
3 3 1.8 85 0.431 1 3
4 4 1.5 62 0.504 1 2
5 5 1.5 4 0.577 1 2
Ideally:
# A tibble: 5 x 6
EmploymentStatus mean n sd min max
<dbl+lbl> <dbl> <int> <dbl> <dbl> <dbl>
1 1 Unemployed 1.67 12 0.492 1 2
2 2 Looking 1.17 6 0.408 1 2
3 3 Etc 1.8 85 0.431 1 3
4 4 1.5 62 0.504 1 2
5 5 1.5 4 0.577 1 2
dput(head(well_being_df, 10))
structure(list(Age = c(22, 20, 23, 20, 25, 18, 24, 21, 21, 30.7344197070233
), Gender = structure(c(2, 2, 1, 2, 1, 2, 2, 2, 2, 1), labels = c(Male = 1,
Female = 2, Transgender = 3), class = "labelled"), EmploymentStatus = structure(c(3,
1, 4, 3, 3, 3, 3, 4, 3, 4), labels = c(`Unemployed but not looking` = 1,
`Unemployed and looking` = 2, `Part-time` = 3, `Full-time` = 4,
Retired = 5), class = "labelled"), Cognition1 = structure(c(6,
3, 6, 5, 9, 6, 4, 4, 7, 5), labels = c(`Provides nothing that you want` = 0,
`Provides half of what you want` = 5, `Provides all that you want` = 10
), class = "labelled"), Cognition2 = structure(c(7, 3, 8,
5, 8, 5, 5, 7, 7, 3), labels = c(`Far below average` = 0,
`About Average` = 5, `Far above average` = 10), class = "labelled"),
Cognition3 = structure(c(6, 5, 4, 5, 6, 5, 5, 5, 5, 5), labels = c(`Far less than you deserve` = 0,
`About what you deserve` = 5, `Far more than you deserve` = 10
), class = "labelled"), Cognition4 = structure(c(7, 3, 6,
2, 8, 3, 3, 5, 6, 2), labels = c(`Far less than you need` = 0,
`About what you need` = 5, `Far more than you need` = 10), class = "labelled"),
Cognition5 = structure(c(10, 9, 6, 3, 7, 2, 2, 0, 4, 0), labels = c(`Far less than expected` = 0,
`About as expected` = 5, `Far more than expected` = 10), class = "labelled"),
Cognition6 = structure(c(8, 6, 0, 3, 3, 8, 9, 10, 5, 10), labels = c(`Far more than it will in the future` = 0,
`About what you expect in the future` = 5, `Far less than what the future will offer` = 10
), class = "labelled"), Cognition7 = structure(c(9, 7, 10,
5, 6, 2, 3, 0, 8, 3), labels = c(`Far below previous best` = 0,
`Equals previous best` = 5, `Far above previous best` = 10
), class = "labelled")), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
# Summary statistics of Gender for each EmploymentStatus, shown by label.
Employment <- well_being_df %>%
  select(EmploymentStatus, Gender) %>%
  # Replace the numeric codes with their value labels (labelled package).
  mutate(EmploymentStatus = labelled::to_factor(EmploymentStatus)) %>%
  group_by(EmploymentStatus) %>%
  # Explicit summaries replace the deprecated funs(). `na.rm = TRUE` replaces
  # `is.na = TRUE`, which min()/max() would treat as an extra value (1).
  summarise(
    mean = mean(Gender),
    n = n(),
    sd = sd(Gender),
    min = min(Gender, na.rm = TRUE),
    max = max(Gender, na.rm = TRUE)
  )