I have a data set which has gaps in one of the columns (temp). I am trying to fill the gaps using the "temp" data from a "sensor" or mean of "sensors" within the same "treatment", and of course same date stamp. I am trying to do this using tidyverse/lubridate.
date treatment sensor temp
1/01/2019 1 A 30
2/01/2019 1 A 29.1
3/01/2019 1 A 21.2
4/01/2019 1 A NA
1/01/2019 1 B 20.5
2/01/2019 1 B 19.8
3/01/2019 1 B 35.1
4/01/2019 1 B 23.5
1/01/2019 2 C 31.2
2/01/2019 2 C 32.1
3/01/2019 2 C 28.1
4/01/2019 2 C 31.2
1/01/2019 2 D NA
2/01/2019 2 D 26.5
3/01/2019 2 D 27.9
4/01/2019 2 D 28
This is what I am expecting:
date treatment sensor temp
1/01/2019 1 A 30
2/01/2019 1 A 29.1
3/01/2019 1 A 21.2
4/01/2019 1 A 23.5
1/01/2019 1 B 20.5
2/01/2019 1 B 19.8
3/01/2019 1 B 35.1
4/01/2019 1 B 23.5
1/01/2019 2 C 31.2
2/01/2019 2 C 32.1
3/01/2019 2 C 28.1
4/01/2019 2 C 31.2
1/01/2019 2 D 31.2
2/01/2019 2 D 26.5
3/01/2019 2 D 27.9
4/01/2019 2 D 28
Many thanks for your help.
Another option with na.aggregate from zoo
library(dplyr)
library(zoo)
# Within each date/treatment cell, replace every missing temp with the
# mean of the readings that are present in that cell.
df %>%
  group_by(date, treatment) %>%
  mutate(temp = na.aggregate(temp, FUN = mean))
# A tibble: 16 x 4
# Groups: date, treatment [8]
# date treatment sensor temp
# <fct> <int> <fct> <dbl>
# 1 1/01/2019 1 A 30
# 2 2/01/2019 1 A 29.1
# 3 3/01/2019 1 A 21.2
# 4 4/01/2019 1 A 23.5
# 5 1/01/2019 1 B 20.5
# 6 2/01/2019 1 B 19.8
# 7 3/01/2019 1 B 35.1
# 8 4/01/2019 1 B 23.5
# 9 1/01/2019 2 C 31.2
#10 2/01/2019 2 C 32.1
#11 3/01/2019 2 C 28.1
#12 4/01/2019 2 C 31.2
#13 1/01/2019 2 D 31.2
#14 2/01/2019 2 D 26.5
#15 3/01/2019 2 D 27.9
#16 4/01/2019 2 D 28
data
df <- structure(list(date = structure(c(1L, 2L, 3L, 4L, 1L, 2L, 3L,
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L), .Label = c("1/01/2019",
"2/01/2019", "3/01/2019", "4/01/2019"), class = "factor"), treatment = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L),
sensor = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 3L, 4L, 4L, 4L, 4L), .Label = c("A", "B", "C", "D"
), class = "factor"), temp = c(30, 29.1, 21.2, NA, 20.5,
19.8, 35.1, 23.5, 31.2, 32.1, 28.1, 31.2, NA, 26.5, 27.9,
28)), class = "data.frame", row.names = c(NA, -16L))
How about this:
# Compute the mean temp of each date/treatment cell, then build temp2 by
# keeping temp where it exists and using that cell mean where it is missing.
df <- df %>%
  group_by(date, treatment) %>%
  mutate(
    fill = mean(temp, na.rm = TRUE),  # value to fill in blanks
    temp2 = if_else(is.na(temp), fill, temp)
  )
Here is one option using map2_dbl from purrr. We group_by treatment and replace NA temp with the first non-NA temp with the same date in the group.
library(dplyr)
library(purrr)
# Within each treatment, replace a missing temp with the first non-missing
# temp that shares its date; readings that are present are kept unchanged.
df %>%
  group_by(treatment) %>%
  mutate(temp = map2_dbl(temp, date, ~ if (is.na(.x)) {
    # match(TRUE, ...) gives NA when no row in the group has a reading for
    # this date, so the gap stays NA. which.max() would return 1 for an
    # all-FALSE mask and silently copy the group's first temp instead.
    temp[match(TRUE, date == .y & !is.na(temp))]
  } else {
    .x
  }))
# date treatment sensor temp
# <fct> <int> <fct> <dbl>
# 1 1/01/2019 1 A 30
# 2 2/01/2019 1 A 29.1
# 3 3/01/2019 1 A 21.2
# 4 4/01/2019 1 A 23.5
# 5 1/01/2019 1 B 20.5
# 6 2/01/2019 1 B 19.8
# 7 3/01/2019 1 B 35.1
# 8 4/01/2019 1 B 23.5
# 9 1/01/2019 2 C 31.2
#10 2/01/2019 2 C 32.1
#11 3/01/2019 2 C 28.1
#12 4/01/2019 2 C 31.2
#13 1/01/2019 2 D 31.2
#14 2/01/2019 2 D 26.5
#15 3/01/2019 2 D 27.9
#16 4/01/2019 2 D 28
data
df <- structure(list(date = structure(c(1L, 2L, 3L, 4L, 1L, 2L, 3L,
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L), .Label = c("1/01/2019",
"2/01/2019", "3/01/2019", "4/01/2019"), class = "factor"), treatment =
c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L),
sensor = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 3L, 4L, 4L, 4L, 4L), .Label = c("A", "B", "C", "D"
), class = "factor"), temp = c(30, 29.1, 21.2, NA, 20.5,
19.8, 35.1, 23.5, 31.2, 32.1, 28.1, 31.2, NA, 26.5, 27.9,
28)), class = "data.frame", row.names = c(NA, -16L))
Related
I am trying to merge two dataframes by date in R.
The first dataframe records daily temperatures. It has only 28 rows, and no dates are repeated.
head(df1)
Day MaxTemp MinTemp
2019-06-15 23.8 14.4
2019-06-16 24.9 11.7
2019-06-17 23.2 8.7
The second dataframe records hourly temperatures, and so has many more rows, with dates repeated.
head(df2)
Day Hour Temp
2019-06-15 14 22.8
2019-06-15 15 22.4
2019-06-15 16 21.9
I would like to merge the data to look something like this:
Day Hour Temp MaxTemp MinTemp
2019-06-15 14 22.8 23.8 14.4
2019-06-15 15 22.4 23.8 14.4
2019-06-15 16 21.9 23.8 14.4
But what I end up with is:
allData <-merge(df1, df2, by="Day", all.y=T)
head(allData)
Day Hour Temp MaxTemp MinTemp
2019-06-15 14 22.8 NA NA
2019-06-15 15 22.4 NA NA
2019-06-15 16 21.9 NA NA
Or if I try "all = T" in the arguments I get "Error in x[[n]][i] <- value[[n]] : replacement has length zero".
Does anyone have any idea how I can fix this?
Edit:
# head of df1
df1 <- structure(list(Day = structure(list(sec = c(0, 0, 0, 0, 0, 0),
min = c(0L, 0L, 0L, 0L, 0L, 0L), hour = c(0L, 0L, 0L, 0L,
0L, 0L), mday = 15:20, mon = c(5L, 5L, 5L, 5L, 5L, 5L), year = c(119L,
119L, 119L, 119L, 119L, 119L), wday = c(6L, 0L, 1L, 2L, 3L,
4L), yday = 165:170, isdst = c(1L, 1L, 1L, 1L, 1L, 1L), zone = c("CDT",
"CDT", "CDT", "CDT", "CDT", "CDT"), gmtoff = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_
)), class = c("POSIXlt", "POSIXt")), Max = c(23.8, 24.9, 23.2, 22.4, 25.1, 24.4), Min = c(14.4, 11.7, 8.7, 8.7, 9.8, 10)), row.names = c(NA, 6L), class ="data.frame")
# head of df2
df2 <- structure(list(Date = structure(list(sec = c(0, 0, 0, 0, 0, 0),
min = c(0L,30L, 0L, 30L, 0L, 30L), hour = c(14L, 14L, 15L, 15L, 16L, 16L),
mday = c(15L, 15L, 15L, 15L, 15L, 15L), mon = c(5L, 5L, 5L, 5L, 5L, 5L),
year = c(119L, 119L, 119L, 119L, 119L, 119L), wday = c(6L, 6L, 6L, 6L, 6L,
6L), yday = c(165L,165L, 165L, 165L, 165L, 165L), isdst = c(1L, 1L, 1L, 1L,
1L, 1L), zone =c("CDT", "CDT", "CDT", "CDT", "CDT", "CDT"), gmtoff =
c(NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_)),class = c("POSIXlt","POSIXt")), Temp = c(22.8, 22.4, 22.4,
22.3,21.9, 21.3), Hour =c(14L, 14L, 15L, 15L, 16L, 16L), Day =
structure(c(18062,18062, 18062, 18062, 18062, 18062), class = "Date")),
row.names= c(NA, 6L), class = "data.frame")
Confirmed with your dput output:
class(df1$Day)
# [1] "POSIXlt" "POSIXt"
class(df2$Day)
# [1] "Date"
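You need to convert one to the other's class. In this sample every df1$Day value has the same time of day (midnight), so converting it to Date loses nothing. Without the conversion the merge finds no matching keys; after as.Date() it works: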
merge(df1, df2, by = "Day", all.y = TRUE)
# Day Max Min Date Temp Hour
# 1 2019-06-15 NA NA 2019-06-15 14:00:00 22.8 14
# 2 2019-06-15 NA NA 2019-06-15 14:30:00 22.4 14
# 3 2019-06-15 NA NA 2019-06-15 15:00:00 22.4 15
# 4 2019-06-15 NA NA 2019-06-15 15:30:00 22.3 15
# 5 2019-06-15 NA NA 2019-06-15 16:00:00 21.9 16
# 6 2019-06-15 NA NA 2019-06-15 16:30:00 21.3 16
df1$Day <- as.Date(df1$Day)
merge(df1, df2, by = "Day", all.y = TRUE)
# Day Max Min Date Temp Hour
# 1 2019-06-15 23.8 14.4 2019-06-15 14:00:00 22.8 14
# 2 2019-06-15 23.8 14.4 2019-06-15 14:30:00 22.4 14
# 3 2019-06-15 23.8 14.4 2019-06-15 15:00:00 22.4 15
# 4 2019-06-15 23.8 14.4 2019-06-15 15:30:00 22.3 15
# 5 2019-06-15 23.8 14.4 2019-06-15 16:00:00 21.9 16
# 6 2019-06-15 23.8 14.4 2019-06-15 16:30:00 21.3 16
I'll go out on a limb and say that the class of your Day columns is different.
Going with "raw data" as copied from the question, Day will be strings for both frames:
df1 <- read.table(header = TRUE, text = "
Day MaxTemp MinTemp
2019-06-15 23.8 14.4
2019-06-16 24.9 11.7
2019-06-17 23.2 8.7")
df2 <- read.table(header = TRUE, text = "
Day Hour Temp
2019-06-15 14 22.8
2019-06-15 15 22.4
2019-06-15 16 21.9")
str(lapply(df1, class))
# List of 3
# $ Day : chr "character"
# $ MaxTemp: chr "numeric"
# $ MinTemp: chr "numeric"
merge(df1, df2, by = "Day")
# Day MaxTemp MinTemp Hour Temp
# 1 2019-06-15 23.8 14.4 14 22.8
# 2 2019-06-15 23.8 14.4 15 22.4
# 3 2019-06-15 23.8 14.4 16 21.9
If I convert one of them to a Date class:
df1$Day <- as.Date(df1$Day)
str(lapply(df1, class))
# List of 3
# $ Day : chr "Date"
# $ MaxTemp: chr "numeric"
# $ MinTemp: chr "numeric"
merge(df1, df2, by = "Day", all.y = TRUE)
# Day MaxTemp MinTemp Hour Temp
# 1 2019-06-15 NA NA 14 22.8
# 2 2019-06-15 NA NA 15 22.4
# 3 2019-06-15 NA NA 16 21.9
Fixes include:
Converting the other frame's Day to a date:
df2$Day <- as.Date(df2$Day)
merge(df1, df2, by = "Day", all.y = TRUE)
# Day MaxTemp MinTemp Hour Temp
# 1 2019-06-15 23.8 14.4 14 22.8
# 2 2019-06-15 23.8 14.4 15 22.4
# 3 2019-06-15 23.8 14.4 16 21.9
Converting both Day columns back to character (or factor):
df1$Day <- as.character(df1$Day)
df2$Day <- as.character(df2$Day)
merge(df1, df2, by = "Day", all.y = TRUE)
# Day MaxTemp MinTemp Hour Temp
# 1 2019-06-15 23.8 14.4 14 22.8
# 2 2019-06-15 23.8 14.4 15 22.4
# 3 2019-06-15 23.8 14.4 16 21.9
Though in this case it's likely (and perhaps even recommended) that you convert them back to Date at some point (since it is a numeric data type, after all).
My data look like the following example.
# A tibble: 18 x 4
DATE AUTHOR PRODUCT SALES
<dttm> <chr> <chr> <dbl>
1 2019-11-27 James B 80
2 2019-11-28 James B 100
3 2019-11-27 James A 80
4 2019-11-28 James A 100
5 2019-11-26 Frank B 70
6 2019-11-27 Frank B 75
7 2019-11-28 Frank B 65
8 2019-11-26 Frank A 70
9 2019-11-27 Frank A 75
10 2019-11-28 Frank A 65
11 2019-11-25 Mary A 100
12 2019-11-26 Mary A 80
13 2019-11-27 Mary A 95
14 2019-11-28 Mary A 110
15 2019-11-25 Mary B 100
16 2019-11-26 Mary B 80
17 2019-11-27 Mary B 95
18 2019-11-28 Mary B 110
I would like to add a "DIFF" column containing the day-over-day difference in SALES, calculated within each AUTHOR. My issues here are the following:
I have a different number of rows for every AUTHOR.
The same DATE could be repeated for some AUTHORS to report different information (in this example is PRODUCT), but the value for SALES will always remain the same, since it only depends on the DATE and the AUTHOR.
I have to keep every row in the dataset because every row contains specific information, so I cannot just drop the rows where DATE is duplicated.
Ideally I would implement the whole with a loop function in my script.
My desired outcome would be:
# A tibble: 18 x 4
DATE AUTHOR PRODUCT SALES DIFF
<dttm> <chr> <chr> <dbl>
1 2019-11-27 James B 80
2 2019-11-28 James B 100 20
3 2019-11-27 James A 80
4 2019-11-28 James A 100 20
5 2019-11-26 Frank B 70
6 2019-11-27 Frank B 75 5
7 2019-11-28 Frank B 65 -10
8 2019-11-26 Frank A 70
9 2019-11-27 Frank A 75 5
10 2019-11-28 Frank A 65 -10
11 2019-11-25 Mary A 100
12 2019-11-26 Mary A 80 -20
13 2019-11-27 Mary A 95 15
14 2019-11-28 Mary A 110 15
15 2019-11-25 Mary B 100
16 2019-11-26 Mary B 80 -20
17 2019-11-27 Mary B 95 15
18 2019-11-28 Mary B 110 15
I tried different things with dplyr and mutate but nothing seemed to work. Does anyone have suggestions?
Thank you!
You could use lag to subtract previous value by group
library(dplyr)
# Day-over-day change in SALES within each AUTHOR/PRODUCT series.
# order_by = DATE makes lag() pick the previous date's value even if the
# rows of a group are not already stored in date order; the first date of
# each series has no predecessor and gets NA.
df %>%
  group_by(AUTHOR, PRODUCT) %>%
  mutate(diff = SALES - lag(SALES, order_by = DATE))
# DATE AUTHOR PRODUCT SALES diff
# <fct> <fct> <fct> <int> <int>
# 1 2019-11-27 James B 80 NA
# 2 2019-11-28 James B 100 20
# 3 2019-11-27 James A 80 NA
# 4 2019-11-28 James A 100 20
# 5 2019-11-26 Frank B 70 NA
# 6 2019-11-27 Frank B 75 5
# 7 2019-11-28 Frank B 65 -10
# 8 2019-11-26 Frank A 70 NA
# 9 2019-11-27 Frank A 75 5
#10 2019-11-28 Frank A 65 -10
#11 2019-11-25 Mary A 100 NA
#12 2019-11-26 Mary A 80 -20
#13 2019-11-27 Mary A 95 15
#14 2019-11-28 Mary A 110 15
#15 2019-11-25 Mary B 100 NA
#16 2019-11-26 Mary B 80 -20
#17 2019-11-27 Mary B 95 15
#18 2019-11-28 Mary B 110 15
Or using diff
# Same result via base diff(): pad with a leading NA so the vector is as
# long as the group.
df %>%
  group_by(AUTHOR, PRODUCT) %>%
  mutate(diff = c(NA, base::diff(SALES)))
data
df <- structure(list(DATE = structure(c(3L, 4L, 3L, 4L, 2L, 3L, 4L,
2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L), .Label = c("2019-11-25",
"2019-11-26", "2019-11-27", "2019-11-28"), class = "factor"),
AUTHOR = structure(c(2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("Frank",
"James", "Mary"), class = "factor"), PRODUCT = structure(c(2L,
2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L), .Label = c("A", "B"), class = "factor"), SALES = c(80L,
100L, 80L, 100L, 70L, 75L, 65L, 70L, 75L, 65L, 100L, 80L,
95L, 110L, 100L, 80L, 95L, 110L)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18"))
We can use shift from data.table
library(data.table)
# Convert df to a data.table in place, add diff by reference within each
# AUTHOR/PRODUCT group, and use the trailing [] to print the result.
setDT(df)[
  ,
  diff := SALES - shift(SALES, n = 1L, type = "lag"),
  by = .(AUTHOR, PRODUCT)
][]
Suppose I have the following dataframe:
dc tmin tmax cint wcmin wcmax wsmin wsmax gsmin gsmax wd rmin rmax cir lr
1: 24 -1 4 5 -5 -2 20 25 35 40 90 11.8 26.6 14.8 3
2: 41 -3 5 8 -8 -3 15 20 35 40 90 10.0 23.5 13.5 3
3: 48 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
4: 50 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
5: 52 3 5 2 -3 1 20 25 35 40 45 6.7 17.4 10.7 6
6: 57 -2 5 7 -6 -1 25 30 35 40 315 4.4 13.8 9.4 7
lc wc li yd yr nF factdcx
1: 1 3 TRUE 1 2010 2 24
2: 1 3 TRUE 1 2010 8 41
3: 2 3 TRUE 1 2010 0 48
4: 2 3 TRUE 1 2010 0 50
5: 2 3 TRUE 1 2010 0 52
6: 3 3 FALSE 1 2010 0 57
I'd like to turn it into a new dataframe like the following:
dc tmin tmax cint wcmin wcmax wsmin wsmax gsmin gsmax wd rmin rmax cir lr
1: 24 -1 4 5 -5 -2 20 25 35 40 90 11.8 26.6 14.8 3
2: 41 -3 5 8 -8 -3 15 20 35 40 90 10.0 23.5 13.5 3
3: 48 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
4: 52 3 5 2 -3 1 20 25 35 40 45 6.7 17.4 10.7 6
5: 57 -2 5 7 -6 -1 25 30 35 40 315 4.4 13.8 9.4 7
lc wc li yd yr nF factdcx
1: 1 3 TRUE 1 2010 2 24
2: 1 3 TRUE 1 2010 8 41
3: 2 3 TRUE 1 2010 0 (sum of nF for 48 and 50, factdcx) 48
4: 2 3 TRUE 1 2010 0 52
5: 3 3 FALSE 1 2010 0 57
How can I do it? (Surely, the dataframe, abc, is much larger, but I want the sum of all categories of 48 and 50 and group it into a new category, say '48').
Many thanks!
> dput(head(abc1))
structure(list(dc = c(24L, 41L, 48L, 50L, 52L, 57L), tmin = c(-1L,
-3L, 0L, 0L, 3L, -2L), tmax = c(4L, 5L, 5L, 5L, 5L, 5L), cint = c(5L,
8L, 5L, 5L, 2L, 7L), wcmin = c(-5L, -8L, -4L, -4L, -3L, -6L),
wcmax = c(-2L, -3L, 0L, 0L, 1L, -1L), wsmin = c(20L, 15L,
30L, 30L, 20L, 25L), wsmax = c(25L, 20L, 35L, 35L, 25L, 30L
), gsmin = c(35L, 35L, 45L, 45L, 35L, 35L), gsmax = c(40L,
40L, 50L, 50L, 40L, 40L), wd = c(90L, 90L, 45L, 45L, 45L,
315L), rmin = c(11.8, 10, 7.3, 7.3, 6.7, 4.4), rmax = c(26.6,
23.5, 19, 19, 17.4, 13.8), cir = c(14.8, 13.5, 11.7, 11.7,
10.7, 9.4), lr = c(3L, 3L, 6L, 6L, 6L, 7L), lc = c(1L, 1L,
2L, 2L, 2L, 3L), wc = c(3L, 3L, 3L, 3L, 3L, 3L), li = c(TRUE,
TRUE, TRUE, TRUE, TRUE, FALSE), yd = c(1L, 1L, 1L, 1L, 1L,
1L), yr = c(2010L, 2010L, 2010L, 2010L, 2010L, 2010L), nF = c(2L,
8L, 0L, 0L, 0L, 0L), factdcx = structure(1:6, .Label = c("24",
"41", "48", "50", "52", "57", "70"), class = "factor")), .Names = c("dc",
"tmin", "tmax", "cint", "wcmin", "wcmax", "wsmin", "wsmax", "gsmin",
"gsmax", "wd", "rmin", "rmax", "cir", "lr", "lc", "wc", "li",
"yd", "yr", "nF", "factdcx"), class = c("data.table", "data.frame"
), row.names = c(NA, -6L), .internal.selfref = <pointer: 0x054b24a0>)
Still got a problem, sir/madam:
> head(abc1 (updated))
dc tmin tmax cint wcmin wcmax wsmin wsmax gsmin gsmax wd rmin rmax cir lr
1: 24 -1 4 5 -5 -2 20 25 35 40 90 11.8 26.6 14.8 3
2: 41 -3 5 8 -8 -3 15 20 35 40 90 10.0 23.5 13.5 3
3: 48 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
4: 52 3 5 2 -3 1 20 25 35 40 45 6.7 17.4 10.7 6
5: 57 -2 5 7 -6 -1 25 30 35 40 315 4.4 13.8 9.4 7
6: 70 -2 3 5 -4 -1 20 25 30 35 360 3.6 10.2 6.6 7
lc wc li yd yr nF factdcx
1: 1 3 TRUE 1 2010 2 24
2: 1 3 TRUE 1 2010 8 41
3: 2 3 TRUE 1 2010 57 48
4: 2 3 TRUE 1 2010 0 52
5: 3 3 FALSE 1 2010 0 57
6: 3 2 TRUE 1 2010 1 70
The sum of nF was incorrect, it should be zero.
Try
library(data.table)
# Convert df1 to a data.table in place.
setDT(df1)
# factdcx is a factor; turn it into plain text so it can be matched with
# %chin% and overwritten with a string.
df1[, factdcx := as.character(factdcx)]
# Relabel the 48/50 rows as "48" and give each of them the combined nF.
df1[factdcx %chin% c('48', '50'),
    c('dc', 'factdcx', 'nF') := list('48', '48', sum(nF))]
# Rows that have become identical after relabelling collapse into one.
unique(df1)
# dc tmin tmax cint wcmin wcmax wsmin wsmax gsmin gsmax wd rmin rmax cir lr
#1: 24 -1 4 5 -5 -2 20 25 35 40 90 11.8 26.6 14.8 3
#2: 41 -3 5 8 -8 -3 15 20 35 40 90 10.0 23.5 13.5 3
#3: 48 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
#4: 52 3 5 2 -3 1 20 25 35 40 45 6.7 17.4 10.7 6
#5: 57 -2 5 7 -6 -1 25 30 35 40 315 4.4 13.8 9.4 7
# lc wc li yd yr nF factdcx
#1: 1 3 TRUE 1 2010 2 24
#2: 1 3 TRUE 1 2010 8 41
#3: 2 3 TRUE 1 2010 0 48
#4: 2 3 TRUE 1 2010 0 52
#5: 3 3 FALSE 1 2010 0 57
For abc1,
# Merge the 48 and 50 categories of abc1 into a single "48" category whose
# nF is the total over both, then drop rows that became duplicates.
# abc1$dc is an integer column, so it is assigned 48L rather than the
# double 48 to avoid a type coercion on assignment.
res1 <- unique(setDT(abc1)[, factdcx := as.character(factdcx)][factdcx %chin%
   c('48', '50'), c('dc', 'factdcx', 'nF') := list(48L, '48', sum(nF))])
res1
# dc tmin tmax cint wcmin wcmax wsmin wsmax gsmin gsmax wd rmin rmax cir lr
#1: 24 -1 4 5 -5 -2 20 25 35 40 90 11.8 26.6 14.8 3
#2: 41 -3 5 8 -8 -3 15 20 35 40 90 10.0 23.5 13.5 3
#3: 48 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
#4: 52 3 5 2 -3 1 20 25 35 40 45 6.7 17.4 10.7 6
#5: 57 -2 5 7 -6 -1 25 30 35 40 315 4.4 13.8 9.4 7
# lc wc li yd yr nF factdcx
#1: 1 3 TRUE 1 2010 2 24
#2: 1 3 TRUE 1 2010 8 41
#3: 2 3 TRUE 1 2010 0 48
#4: 2 3 TRUE 1 2010 0 52
#5: 3 3 FALSE 1 2010 0 57
data
df1 <- structure(list(dc = structure(1:6, .Label = c("24", "41",
"48",
"50", "52", "57"), class = "factor"), tmin = c(-1L, -3L, 0L,
0L, 3L, -2L), tmax = c(4L, 5L, 5L, 5L, 5L, 5L), cint = c(5L,
8L, 5L, 5L, 2L, 7L), wcmin = c(-5L, -8L, -4L, -4L, -3L, -6L),
wcmax = c(-2L, -3L, 0L, 0L, 1L, -1L), wsmin = c(20L, 15L,
30L, 30L, 20L, 25L), wsmax = c(25L, 20L, 35L, 35L, 25L, 30L
), gsmin = c(35L, 35L, 45L, 45L, 35L, 35L), gsmax = c(40L,
40L, 50L, 50L, 40L, 40L), wd = c(90L, 90L, 45L, 45L, 45L,
315L), rmin = c(11.8, 10, 7.3, 7.3, 6.7, 4.4), rmax = c(26.6,
23.5, 19, 19, 17.4, 13.8), cir = c(14.8, 13.5, 11.7, 11.7,
10.7, 9.4), lr = c(3L, 3L, 6L, 6L, 6L, 7L), lc = c(1L, 1L,
2L, 2L, 2L, 3L), wc = c(3L, 3L, 3L, 3L, 3L, 3L), li = c(TRUE,
TRUE, TRUE, TRUE, TRUE, FALSE), yd = c(1L, 1L, 1L, 1L, 1L,
1L), yr = c(2010L, 2010L, 2010L, 2010L, 2010L, 2010L), nF = c(2L,
8L, 0L, 0L, 0L, 0L), factdcx = structure(1:6, .Label = c("24",
"41", "48", "50", "52", "57"), class = "factor")), .Names = c("dc",
"tmin", "tmax", "cint", "wcmin", "wcmax", "wsmin", "wsmax", "gsmin",
"gsmax", "wd", "rmin", "rmax", "cir", "lr", "lc", "wc", "li",
"yd", "yr", "nF", "factdcx"), row.names = c("1:", "2:", "3:",
"4:", "5:", "6:"), class = "data.frame")
# Asker's attempt: maximum of VALUE and DATE per month and per grade bin.
# NOTE(review): the result is stored in `max`, the same name as the function passed to aggregate().
# NOTE(review): the bins are cut from a$CLASS, but the sample data and printed output below show a GRADE column — confirm which is intended.
max=aggregate(cbind(a$VALUE,Date=a$DATE) ~ format(a$DATE, "%m") + cut(a$CLASS, breaks=c(0,2,4,6,8,10,12,14)) , data = a, max)[-1]
# NOTE(review): cbind() above names the column `Date`, while this line reads and writes `max$DATE` — confirm.
max$DATE=as.Date(max$DATE, origin = "1970-01-01")
Sample Data :
DATE GRADE VALUE
2008-09-01 1 20
2008-09-02 2 30
2008-09-03 3 50
.
.
2008-09-30 2 75
.
.
2008-10-01 1 95
.
.
2008-11-01 4 90
.
.
2008-12-01 1 70
2008-12-02 2 40
2008-12-28 4 30
2008-12-29 1 40
2008-12-31 3 50
My Expected output according to above table for only first month is :
DATE GRADE VALUE
2008-09-30 (0,2] 75
2008-09-02 (2,4] 50
Output in my real data :
format(DATE, "%m")
1 09
2 10
3 11
4 12
5 09
6 10
7 11
cut(a$GRADE, breaks = c(0, 2, 4, 6, 8, 10, 12, 14)) value
1 (0,2] 0.30844444
2 (0,2] 1.00000000
3 (0,2] 1.00000000
4 (0,2] 0.73333333
5 (2,4] 0.16983488
6 (2,4] 0.09368000
7 (2,4] 0.10589335
Date
1 2008-09-30
2 2008-10-31
3 2008-11-28
4 2008-12-31
5 2008-09-30
6 2008-10-31
7 2008-11-28
The output above does not correspond to the sample data, because the real data set is too big to post. The logic is simple: there are grades from 1 to 10, and I want to find the highest value for each month within the corresponding grade groups. E.g., I need the highest value for each group (0,2], (2,4], etc.
I used aggregate() with the function max, grouping by two columns, Date and Grade. Now when I run the code and display the value of max, I get three tables as output, one after the other. I want to plot this output but am not able to because of this. So how can I merge all these outputs?
Try:
library(dplyr)
# Maximum of every non-grouping column (DATE and VALUE) for each
# month / two-wide GRADE bin. across() replaces the deprecated
# summarise_each(funs(max)).
a %>%
  group_by(MONTH = format(DATE, "%m"),
           GRADE = cut(GRADE, breaks = seq(0, 14, by = 2))) %>%
  summarise(across(everything(), max))
# MONTH GRADE DATE VALUE
#1 09 (0,2] 2008-09-30 75
#2 09 (2,4] 2008-09-03 50
#3 10 (0,2] 2008-10-01 95
#4 11 (2,4] 2008-11-01 90
#5 12 (0,2] 2008-12-29 70
#6 12 (2,4] 2008-12-31 50
Or using data.table
library(data.table)
# Convert a to a data.table in place, then take the latest DATE and the
# largest VALUE for each month / two-wide GRADE bin.
setDT(a)[
  ,
  .(DATE = max(DATE), VALUE = max(VALUE)),
  by = .(MONTH = format(DATE, "%m"),
         GRADE = cut(GRADE, breaks = seq(0, 14, by = 2)))
]
# MONTH GRADE DATE VALUE
#1: 09 (0,2] 2008-09-30 75
#2: 09 (2,4] 2008-09-03 50
#3: 10 (0,2] 2008-10-01 95
#4: 11 (2,4] 2008-11-01 90
#5: 12 (0,2] 2008-12-29 70
#6: 12 (2,4] 2008-12-31 50
Or using aggregate
# cbind() turns DATE into its underlying day count so that VALUE and DATE
# can be aggregated together as one numeric matrix.
res <- with(
  a,
  aggregate(
    cbind(VALUE, DATE),
    by = list(MONTH = format(DATE, "%m"),
              GRADE = cut(GRADE, breaks = seq(0, 14, by = 2))),
    FUN = max
  )
)
# Restore the Date class that cbind() stripped.
res$DATE <- as.Date(res$DATE, origin = "1970-01-01")
res[order(res$MONTH), ]
# MONTH GRADE VALUE DATE
#1 09 (0,2] 75 2008-09-30
#4 09 (2,4] 50 2008-09-03
#2 10 (0,2] 95 2008-10-01
#5 11 (2,4] 90 2008-11-01
#3 12 (0,2] 70 2008-12-29
#6 12 (2,4] 50 2008-12-31
data
a <- structure(list(DATE = structure(c(14123, 14124, 14125, 14152,
14153, 14184, 14214, 14215, 14241, 14242, 14244), class = "Date"),
GRADE = c(1L, 2L, 3L, 2L, 1L, 4L, 1L, 2L, 4L, 1L, 3L), VALUE = c(20L,
30L, 50L, 75L, 95L, 90L, 70L, 40L, 30L, 40L, 50L)), .Names = c("DATE",
"GRADE", "VALUE"), row.names = c(NA, -11L), class = "data.frame")
Update
If you want to include YEAR also in the grouping
library(dplyr)
# Same summary with YEAR added to the grouping: maximum of every
# non-grouping column (DATE and VALUE) per month, year and GRADE bin.
# across() replaces the deprecated summarise_each(funs(max)).
a %>%
  group_by(MONTH = format(DATE, "%m"),
           YEAR = format(DATE, "%Y"),
           GRADE = cut(GRADE, breaks = seq(0, 14, by = 2))) %>%
  summarise(across(everything(), max))
# MONTH YEAR GRADE DATE VALUE
#1 09 2008 (0,2] 2008-09-30 75
#2 09 2008 (2,4] 2008-09-03 50
#3 09 2009 (0,2] 2009-09-30 75
#4 09 2009 (2,4] 2009-09-03 50
#5 10 2008 (0,2] 2008-10-01 95
#6 10 2009 (0,2] 2009-10-01 95
#7 11 2008 (2,4] 2008-11-01 90
#8 11 2009 (2,4] 2009-11-01 90
#9 12 2008 (0,2] 2008-12-29 70
#10 12 2008 (2,4] 2008-12-31 50
#11 12 2009 (0,2] 2009-12-29 70
#12 12 2009 (2,4] 2009-12-31 50
data
a <- structure(list(DATE = structure(c(14123, 14124, 14125, 14152,
14153, 14184, 14214, 14215, 14241, 14242, 14244, 14488, 14489,
14490, 14517, 14518, 14549, 14579, 14580, 14606, 14607, 14609
), class = "Date"), GRADE = c(1L, 2L, 3L, 2L, 1L, 4L, 1L, 2L,
4L, 1L, 3L, 1L, 2L, 3L, 2L, 1L, 4L, 1L, 2L, 4L, 1L, 3L), VALUE = c(20L,
30L, 50L, 75L, 95L, 90L, 70L, 40L, 30L, 40L, 50L, 20L, 30L, 50L,
75L, 95L, 90L, 70L, 40L, 30L, 40L, 50L)), .Names = c("DATE",
"GRADE", "VALUE"), row.names = c("1", "2", "3", "4", "5", "6",
"7", "8", "9", "10", "11", "12", "21", "31", "41", "51", "61",
"71", "81", "91", "101", "111"), class = "data.frame")
Following code using base R may be helpful (using 'a' dataframe from akrun's answer):
# Split each date string on "-" once; the second piece is the month.
date_parts <- strsplit(as.character(a$DATE), "-", fixed = TRUE)
# vapply() guarantees a character vector with one month string per row.
a$month <- vapply(date_parts, "[", character(1), 2)
# Bin GRADE into two-wide intervals (0,2], (2,4], ..., (12,14].
gradeCats <- cut(a$GRADE, breaks = c(0, 2, 4, 6, 8, 10, 12, 14))
# Highest VALUE for every month / grade-bin combination present in the data.
aggregate(VALUE ~ month + gradeCats, data = a, max)
month gradeCats VALUE
1 09 (0,2] 75
2 10 (0,2] 95
3 12 (0,2] 70
4 09 (2,4] 50
5 11 (2,4] 90
6 12 (2,4] 50
I have a df like this:
> dat
gen M1 M1 M1 M1 M2 M2 M2
G1 150 142 130 105 96
G2 150 145 142 130 96 89
G3 150 145 130 105 96
G4 145 142 130 105 89
G5 150 142 130 105 96
G6 145 142 130 96 89
G7 150 142 105 96
G8 150 145 130 105 89
G9 150 145 142 96 89
Here, data are present under duplicated column ids. I would like to aggregate them like this:
>dat1
gen M1 M1 M1 M1 agg M2 M2 M2 agg
G1 150 142 130 150/142/130 105 96 105/96
G2 150 145 142 130 150/145/142/130 96 89 96/89
G3 150 145 130 150/145/130 105 96 105/96
G4 145 142 130 145/142/130 105 89 105/89
G5 150 142 130 150/142/130 105 96 105/96
G6 145 142 130 145/142/130 96 89 96/89
G7 150 142 150/142 105 96 105/96
G8 150 145 130 150/145/130 105 89 105/89
G9 150 145 142 150/145/142 96 89 96/89
Here, in the agg column I aggregated all the values whose columns share the same (duplicated) name in the first row.
I would like to create a new column at the end of each set of duplicate columns and aggregate the values into it.
How can I do this in R? I am very confused.
EDIT:
dput(dat)
structure(list(V1 = structure(c(10L, 1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L), .Label = c("G1", "G2", "G3", "G4", "G5", "G6", "G7",
"G8", "G9", "gen"), class = "factor"), V2 = structure(c(2L, 1L,
1L, 1L, NA, 1L, NA, 1L, 1L, 1L), .Label = c("150", "M1"), class = "factor"),
V3 = structure(c(2L, NA, 1L, 1L, 1L, NA, 1L, NA, 1L, 1L), .Label = c("145",
"M1"), class = "factor"), V4 = structure(c(2L, 1L, 1L, NA,
1L, 1L, 1L, 1L, NA, 1L), .Label = c("142", "M1"), class = "factor"),
V5 = structure(c(2L, 1L, 1L, 1L, 1L, 1L, 1L, NA, 1L, NA), .Label = c("130",
"M1"), class = "factor"), V6 = structure(c(2L, 1L, NA, 1L,
1L, 1L, NA, 1L, 1L, NA), .Label = c("105", "M2"), class = "factor"),
V7 = structure(c(2L, 1L, 1L, 1L, NA, 1L, 1L, 1L, NA, 1L), .Label = c("96",
"M2"), class = "factor"), V8 = structure(c(2L, NA, 1L, NA,
1L, NA, 1L, NA, 1L, 1L), .Label = c("89", "M2"), class = "factor")), .Names = c("V1",
"V2", "V3", "V4", "V5", "V6", "V7", "V8"), class = "data.frame", row.names = c(NA,
-10L))
This works if the missing values are blanks:
# Join the non-empty cells of one row with "/".
collapse_filled <- function(cells) {
  paste(cells[nchar(cells) > 0], collapse = "/")
}
dat$agg1 <- apply(dat[, 2:5], 1, collapse_filled)
dat$agg2 <- apply(dat[, 6:8], 1, collapse_filled)
# Move each aggregate column directly after the columns it summarises.
dat <- dat[, c(1:5, 9, 6:8, 10)]
dat
# gen M1 M1.1 M1.2 M1.3 agg1 M2 M2.1 M2.2 agg2
# 1 G1 150 142 130 150/142/130 105 96 105/96
# 2 G2 150 145 142 130 150/145/142/130 96 89 96/89
# 3 G3 150 145 130 150/145/130 105 96 105/96
# 4 G4 145 142 130 145/142/130 105 89 105/89
# ...
This works if the missing values are NA
# Row by row, drop the NA cells and join what remains with "/".
dat$agg1 <- apply(
  dat[, 2:5], 1,
  function(row) paste(na.omit(row), collapse = "/")
)
dat$agg2 <- apply(
  dat[, 6:8], 1,
  function(row) paste(na.omit(row), collapse = "/")
)
to aggregate them into a character vector you use paste()
x <- data.frame(x1 = 1:10, x2 = 1:10, x1 = 11:20)
# data.frame() makes the repeated name unique, so the columns are
# x1, x2 and x1.1.
# Paste the three columns together row by row, separated by "/".
xnew <- cbind(x, agg = do.call(paste, c(unname(as.list(x)), sep = "/")))
I am not sure if this is what you want to do because I am a bit confused about the structure of your data.
Here is my script... I Know some of you guys can make it simple and elegant!
I transposed my df (a simple example) and read as table.
> dat<-read.table("dat.txt", header=T, sep="\t", na.strings="")
> dat
gen A B C D
1 M1 1 NA 3 NA
2 M1 NA 6 NA 3
3 M1 4 8 NA NA
4 M1 NA NA 6 3
5 M2 8 NA 6 NA
6 M2 NA 2 NA 6
7 M3 3 8 NA 2
8 M3 8 9 5 NA
9 M4 3 7 8 5
10 M4 5 NA 3 2
> final<-NULL
> for(i in 1:4){
+ mar<-as.character(dat[1,1])
+ dat1<-dat[dat[,1]%in% c(mar),]
+ dat <- dat[!dat[,1]%in% c(mar),]
+ dat2 <- apply(dat1,2,function(x)paste(x[!is.na(x)],collapse="/"))
+ dat2$gen<-mar
+ dat3<-rbind(dat1,dat2)
+ final<-rbind(final, dat3)
+ }
Warning messages:
1: In dat2$gen <- mar : Coercing LHS to a list
2: In dat2$gen <- mar : Coercing LHS to a list
3: In dat2$gen <- mar : Coercing LHS to a list
4: In dat2$gen <- mar : Coercing LHS to a list
> final
gen A B C D
1 M1 1 <NA> 3 <NA>
2 M1 <NA> 6 <NA> 3
3 M1 4 8 <NA> <NA>
4 M1 <NA> <NA> 6 3
5 M1 1/ 4 6/ 8 3/ 6 3/ 3
51 M2 8 <NA> 6 <NA>
6 M2 <NA> 2 <NA> 6
31 M2 8 2 6 6
7 M3 3 8 <NA> 2
8 M3 8 9 5 <NA>
32 M3 3/8 8/9 5 2
9 M4 3 7 8 5
10 M4 5 <NA> 3 2
33 M4 3/5 7 8/3 5/2