how do you re-organize cells in R - r

I have this df:
structure(list(YEAR = c("2007", "2007", "2007", "2008", "2008",
"2008", "2008", "2008", "2008", "2008"), MONTH = c("12", "10",
"11", "01", "03", "05", "06", "08", "09", "10"), TOTAL = c(85055988L,
21567576L, 82763640L, 91007916L, 93936288L, 99646750L, 90091044L,
98811936L, 96888876L, 100909236L)), .Names = c("YEAR", "MONTH",
"TOTAL"), row.names = c("24801", "33863", "34055", "24973", "25046",
"25295", "25384", "25541", "25861", "27319"), class = "data.frame")
I would like to organize this data frame as follows:
YEAR JAN FEB MARCH .... DEC
2009 TOTAL VALUE FOR EACH month goes to each corresponding cells.
any ideas how I could easily accomplish this in R?

dcast and xtabs are among the options to consider:
xtabs(TOTAL ~ YEAR + MONTH, df)
# MONTH
# YEAR 01 03 05 06 08 09 10 11 12
# 2007 0 0 0 0 0 0 21567576 82763640 85055988
# 2008 91007916 93936288 99646750 90091044 98811936 96888876 100909236 0 0
library(reshape2)
dcast(df, YEAR ~ MONTH, value.var="TOTAL", fun.aggregate=sum)
# YEAR 01 03 05 06 08 09 10 11 12
# 1 2007 0 0 0 0 0 0 21567576 82763640 85055988
# 2 2008 91007916 93936288 99646750 90091044 98811936 96888876 100909236 0 0

Related

How to create new column based off information in other columns in R

I have a large dataset that spans over 20 years. I have a column for the date and another column for the hour ending (HE). I'm trying to add a new column to provide the hour by hour (hrxhr) information in a given year (so running total). So date: Jan 1, 2023, HE: 1 should be hrxhr: 1 and Dec 31, 2023, HE: 24, should be hrxhr:8760 (8784 on leap years).
Should look like this:
YEAR
MONTH
DAY
HOUR OF DAY
Month_num
Date
Date1
NEW COLUMN hrxhr
2023
Dec
31
22
12
2023-12-31
365
8758
2023
Dec
31
23
12
2023-12-31
365
8759
2023
Dec
31
24
12
2023-12-31
365
8760
2024
Jan
01
1
01
2024-01-01
1
1
2024
Jan
01
2
01
2024-01-01
1
2
At first I thought I could get the Julian date and then multiple that by the HE, but that is incorrect since Jan 2, 2023, HE:1 would then equal 2 but the hrxhr/running total should equal 25.
In base R:
df <- data.frame(
YEAR = c(2023L, 2023L, 2023L, 2024L, 2023L),
MONTH = c("Dec", "Dec", "Dec", "Jan", "Jan"), DAY = c(31L, 31L, 31L, 1L, 1L),
HOUR_OF_DAY = c(22L, 23L, 24L, 1L, 2L), Month_num = c(12L,
12L, 12L, 12L, 12L), Date = c("2023-12-31", "2023-12-31",
"2023-12-31", "2024-01-01", "2024-01-01"), Date1 = c(365L,
365L, 365L, 1L, 1L))
df$hrxhr <- mapply(\(from, to, by) length(seq.POSIXt(from, to, by)),
from = trunc(as.POSIXlt(df$Date), "years"),
to = as.POSIXlt(df$Date),
by="1 hour") + df$HOUR_OF_DAY - 1
df
#> YEAR MONTH DAY HOUR_OF_DAY Month_num Date Date1 hrxhr
#> 1 2023 Dec 31 22 12 2023-12-31 365 8758
#> 2 2023 Dec 31 23 12 2023-12-31 365 8759
#> 3 2023 Dec 31 24 12 2023-12-31 365 8760
#> 4 2024 Jan 1 1 12 2024-01-01 1 1
#> 5 2023 Jan 1 2 12 2024-01-01 1 2
If you are open to a tidyverse / lubridate solution, you could use
library(dplyr)
library(lubridate)
df1 %>%
mutate(
begin = ymd_hms(paste(year(Date), "-01-01 00:00:00")),
target = ymd_hms(paste(Date, HOUR_OF_DAY, ":00:00")),
hrxhr = time_length(interval(begin, target), "hours")) %>%
select(-begin, -target)
This returns
# A tibble: 5 × 7
YEAR MONTH DAY HOUR_OF_DAY Month_num Date hrxhr
<dbl> <chr> <chr> <dbl> <dbl> <date> <dbl>
1 2023 Dec 31 22 12 2023-12-31 8758
2 2023 Dec 31 23 12 2023-12-31 8759
3 2023 Dec 31 24 12 2023-12-31 8760
4 2024 Jan 01 1 12 2024-01-01 1
5 2024 Jan 01 2 12 2024-01-01 2
Data
structure(list(YEAR = c(2023, 2023, 2023, 2024, 2024), MONTH = c("Dec",
"Dec", "Dec", "Jan", "Jan"), DAY = c("31", "31", "31", "01",
"01"), HOUR_OF_DAY = c(22, 23, 24, 1, 2), Month_num = c(12, 12,
12, 12, 12), Date = structure(c(19722, 19722, 19722, 19723, 19723
), class = "Date")), row.names = c(NA, -5L), class = "data.frame")

Convert long data to wide data with multiple columns and multiple values

I have a data frame of 300K rows with 2 main columns of interest. (NAME & SUBJCT) I need to convert this data into a wide format and in addition, if I get a records for a particular subject with multiple dates, I need to place them next to each other.
I tried using tidyr::pivot_wider but I'm not able to get it work.
Sample data:
DF <- data.frame(
NAME = c("ABC", "ABC", "DEF", "ABC", "ABC", "ABC", "DEF", "ABC", "DEF", "ABC", "DEF", "DEF", "DEF", "DEF", "DEF", "DEF", "ABC"),
SUBJECT = c("MATHS", "LANGUAGE 1", "LANGUAGE 1", "LANGUAGE 2","LANGUAGE 2","LANGUAGE 2","LANGUAGE 2", "SCIENCE", "SCIENCE", "HISTORY", "PE", "ENVIRONMENT", "COMPUTERS", "COMPUTERS", "COMPUTERS", "BIOLOGY", "SANSKRIT"),
YEAR = c("2010", "2011", "2012", "2013", "2014", "2015", "2013", "2015", "2016", "2016", "2017", "2015", "2016", "2017", "2018", "2015", "2013"),
MARKS = c("45", "48", "47", "44", "48", "46", "42", "42", "43", "37", "42", "43", "42", "41", "44", "41", "44"),
MAXIMUM = c("46", rep("50", 5), "45", "50", rep("45", 9))
)
> DF
NAME SUBJECT YEAR MARKS MAXIMUM
1 ABC MATHS 2010 45 46
2 ABC LANGUAGE 1 2011 48 50
3 DEF LANGUAGE 1 2012 47 50
4 ABC LANGUAGE 2 2013 44 50
5 ABC LANGUAGE 2 2014 48 50
6 ABC LANGUAGE 2 2015 46 50
7 DEF LANGUAGE 2 2013 42 45
8 ABC SCIENCE 2015 42 50
9 DEF SCIENCE 2016 43 45
10 ABC HISTORY 2016 37 45
11 DEF PE 2017 42 45
12 DEF ENVIRONMENT 2015 43 45
13 DEF COMPUTERS 2016 42 45
14 DEF COMPUTERS 2017 41 45
15 DEF COMPUTERS 2018 44 45
16 DEF BIOLOGY 2015 41 45
17 ABC SANSKRIT 2013 44 45
My expected output is like this: (It is a bit long)
Bit tricky with pivoting twice, but here you go:
library(tidyverse)
DF %>%
group_by(NAME, SUBJECT) %>%
mutate(ind = row_number()) %>%
ungroup() %>%
pivot_longer(c("YEAR", "MARKS", "MAXIMUM")) %>%
mutate(name = paste0(name, ind)) %>%
select(-ind) %>%
pivot_wider(names_from = c("SUBJECT", "name"), values_from = "value")

Identifying the column that has the lowest value

I have a data frame given by the following
DF <- structure(list(ID = c(1, 129, 169, 1087), `Collab Years Patents` = c(NA,
"2011, 2011, 2011", "2010", "2006, 2006"), `Collab Years Publications` = c("2011",
"2015, 2016, 2016", "2010", NA), ECP = c("2011", "2011", "2010",
"2006")), .Names = c("ID", "Collab Years Patents", "Collab Years Publications",
"ECP"), row.names = c(1L, 107L, 136L, 859L), class = "data.frame")
The column ECP is the minimum year of the two collaboration columns (which could contain several years). I need an output that says which column the ECP belongs to. For example, a solution to above could be another column vector to above frame with the elements:
structure(list(ID = c(1, 129, 169, 1087), `Collab Years Patents` = c(NA,
"2011, 2011, 2011", "2010", "2006, 2006"), `Collab Years Publications` = c("2011",
"2015, 2016, 2016", "2010", NA), ECP = c("2011", "2011", "2010",
"2006"), identifier = c("Publications", "Patents", "Both", "Patents"
)), .Names = c("ID", "Collab Years Patents", "Collab Years Publications",
"ECP", "identifier"), row.names = c(1L, 107L, 136L, 859L), class = "data.frame")
Here is an option using str_detect. Loop through the collaboration columns (sapply(DF[2:3],), use str_detect to check which one of the column have the value of 'ECP'. multiply by col to convert the TRUE values to the column index, replace the NA elements with 0, get the column names correspond based on the maximum column index, remove the prefix part of the column names with sub, and assign those elements in 'm1' that are greater than 0 i.e. have 'ECP' in both to 'Both' on the created vector 'v1'
library(stringr)
m1 <- col(DF[2:3]) *sapply(DF[2:3], function(x) str_detect(x, DF$ECP))
m1[is.na(m1)] <- 0
v1 <- sub(".*\\s(\\w+)$", "\\1", names(DF)[2:3][max.col(m1)])
v1[rowSums(m1 > 0) ==2] <- "Both"
DF$identifier <- v1
DF$identifier
#[1] "Publications" "Patents" "Both" "Patents"
Using tidyverse (dplyr and purrr):
library(tidyverse)
DF %>%
mutate_at(2:3,strsplit,", ") %>%
transmute(identifier = pmap(.[2:4],~c("Publications","Patents","Both")[
2*(..3 %in% .x) + (..3 %in% .y)])) %>%
bind_cols(DF,.)
# ID Collab Years Patents Collab Years Publications ECP identifier
# 1 1 <NA> 2011 2011 Publications
# 2 129 2011, 2011, 2011 2015, 2016, 2016 2011 Patents
# 3 169 2010 2010 2010 Both
# 4 1087 2006, 2006 <NA> 2006 Patents

Efficient algorithm to calculate values in data.frame without loop

Here is the situation where I got kinda stuck with R. I have data table with one row for each day, something like this:
Date = c(as.Date("2015-12-31"), as.Date("2016-01-01"));
Month1 = c("DEC", "JAN");
Year1 = c("15", "16");
Price1 = c(100, 110);
Month2 = c(NA_character_, NA_character_);
Year2 = c(NA_character_, NA_character_);
Price2 = c(NA_integer_, NA_integer_);
Month3 = c(NA_character_, NA_character_);
Year3 = c(NA_character_, NA_character_);
Price3 = c(NA_integer_, NA_integer_);
Month4 = c(NA_character_, NA_character_);
Year4 = c(NA_character_, NA_character_);
Price4 = c(NA_integer_, NA_integer_);
dataSample = data.frame(Date, Month1, Year1, Price1, Month2, Year2, Price2, Month3, Year3, Price3, Month4, Year4, Price4);
Which gives such a table:
Date Month1 Year1 Price1 Month2 Year2 Price2 Month3 Year3 Price3 Month4 Year4 Price4
1 2015-12-31 DEC 15 100 <NA> <NA> NA <NA> <NA> NA <NA> <NA> NA
2 2016-01-01 JAN 16 110 <NA> <NA> NA <NA> <NA> NA <NA> <NA> NA
Now I need to calculate all months and prices for each. For that I have 2 other data frames:
Date = c(as.Date("2015-12-31"), as.Date("2015-12-31"), as.Date("2015-12-31"), as.Date("2016-01-01"), as.Date("2016-01-01"), as.Date("2016-01-01"));
Month.Start = c("DEC", "JAN", "FEB", "JAN", "FEB", "MAR");
Year.Start = c("15", "16", "16", "16", "16", "16")
Month.End = c("JAN", "FEB", "MAR", "FEB", "MAR", "APR");
Year.End = c("16", "16", "16", "16", "16", "16")
Diff = c(10, 15, -15, 19, -20, -5);
diffsOneMonth = data.frame(Date, Month.Start, Year.Start, Month.End, Year.End, Diff)
Date = c(as.Date("2015-12-31"), as.Date("2016-01-01"));
Month.Start = c("DEC", "MAR");
Year.Start = c("15", "16")
Month.End = c("MAR", "JUN");
Year.End = c("16", "16")
Diff = c(11, 25);
diffsThreeMonth = data.frame(Date, Month.Start, Year.Start, Month.End, Year.End, Diff)
Which gives me these tables:
One month price differences
Date Month.Start Year.Start Month.End Year.End Diff
1 2015-12-31 DEC 15 JAN 16 10
2 2015-12-31 JAN 16 FEB 16 15
3 2015-12-31 FEB 16 MAR 16 -15
4 2016-01-01 JAN 16 FEB 16 19
5 2016-01-01 FEB 16 MAR 16 -20
6 2016-01-01 MAR 16 APR 16 -5
Three month price differences
Date Month.Start Year.Start Month.End Year.End Diff
1 2015-12-31 DEC 15 MAR 16 20
2 2016-01-01 MAR 16 JUN 16 25
Now I must fill dataSample data frame by using data from differences tables. I check what start/end months/years are available there and have to fill those months/years in dataSample. Then take difference of price and set calculated price in dataSample. So for example in dataSample we start with DEC 15, then in diffsOneMonth we have entry DEC 15 - JAN 16 with difference 10 so we add it to DEC 15 price and get JAN 16 price 110:
Date Month1 Year1 Price1 Month2 Year2 Price2 Month3 Year3 Price3 Month4 Year4 Price4
1 2015-12-31 DEC 15 100 JAN 16 110 <NA> <NA> NA <NA> <NA> NA
2 2016-01-01 JAN 16 110 <NA> <NA> NA <NA> <NA> NA <NA> <NA> NA
Now its possible to do next month and then next etc. If we use diffsOneMonth only we would get desirable result like this:
Date Month1 Year1 Price1 Month2 Year2 Price2 Month3 Year3 Price3 Month4 Year4 Price4
1 2015-12-31 DEC 15 100 JAN 16 110 FEB 16 125 MAR 16 110
2 2016-01-01 JAN 16 110 FEB 16 129 MAR 16 109 APR 16 104
However there is additional requirement that I must use wider month spread to calculate prices if its possible. So for 2015-12-31 there exists three month spread from DEC 15 to MAR 16 which should override price from one month difference. So DEC 15 price is 110 and DEC 15 - MAR 16 difference is 11 which makes MAR 16 price not 110 but 111:
Date Month1 Year1 Price1 Month2 Year2 Price2 Month3 Year3 Price3 Month4 Year4 Price4
1 2015-12-31 DEC 15 100 JAN 16 110 FEB 16 125 MAR 16 111
2 2016-01-01 JAN 16 110 FEB 16 129 MAR 16 109 APR 16 104
So for this sample it would be my final desirable output.
Real data is much more complex, with 6 and 12 month differences and 64 months forward for each date. Also some months can be missing. I tried to do it with a loop but it was very slow, however I am not sure how to approach such a problem without a loop. I have created few helper methods to be able to calculate next year/month:
nextContract = function(currentMonth, currentYear, length = 1,
years = c("10", "11", "12", "13", "14", "15", "16", "17", "18"),
months = c("JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC")) {
mIdx <- match(currentMonth, months)+length;
yDiff = ifelse(length(months) < mIdx, mIdx / length(months) - ifelse(mIdx %% length(months) == 0, 1, 0), 0);
return(data.frame(nextMonth(currentMonth, length, months), nextYear(currentYear, length = yDiff)))
}
nextYear = function(currentYear, length = 1, years = c("10", "11", "12", "13", "14", "15", "16", "17", "18")) {
return(years[match(currentYear, years)+length]);
}
nextMonth = function(currentMonth, length = 1, months = c("JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC")) {
mIdx <- match(currentMonth, months)+length;
return(months[ifelse(length(months) < mIdx, ifelse(mIdx %% length(months) != 0, mIdx %% length(months), length(months)), mIdx)]);
}
Example of usage could be:
> nextContract("DEC", "15")
nextMonth.currentMonth..length..months. nextYear.currentYear..length...yDiff.
1 JAN 16
or:
> nextContract("DEC", "15", length = 3)
nextMonth.currentMonth..length..months. nextYear.currentYear..length...yDiff.
1 MAR 16
This got to be pretty long question but I hope someone will take time to review it :)
Thanks in advance!
EDIT
A little bit of improvement on proposed solution and I got what I needed:
outrightAndForwardRows <- list("1" = diffsOneMonth, "3" = diffsThreeMonth) %>%
bind_rows(.id = "time_step") %>%
left_join(dataSample %>%
select(Date, Price1, Month1, Year1) ) %>%
mutate(Day.Start = 1) %>%
mutate(Day.End = 1) %>%
mutate(Outright.Day = 1) %>%
unite("Contract.Start", Day.Start, Month.Start, Year.Start) %>%
unite("Contract.End", Day.End, Month.End, Year.End) %>%
unite("Contract.Outright", Outright.Day, Month1, Year1) %>%
mutate(time_step = as.numeric(time_step),
Contract.Start =
Contract.Start %>%
parse_date_time("%d_%b_%y")) %>%
mutate(Contract.End =
Contract.End %>%
parse_date_time("%d_%b_%y")) %>%
mutate(Contract.Outright =
Contract.Outright %>%
parse_date_time("%d_%b_%y")) %>%
group_by(time_step, Date) %>%
arrange(Contract.End) %>%
mutate(Price = cumsum(Diff) + Price1) %>%
group_by(Date, Contract.End) %>%
slice(time_step %>% which.max) %>%
ungroup() %>%
select(-time_step, -Diff, -Contract.Start)
#### add outright and forward months to the same columns
outright <- outrightAndForwardRows %>% select(Date, Price=Price1, Contract=Contract.Outright) %>% unique
forwardMonths <- outrightAndForwardRows %>% select(Date, Contract=Contract.End, Price)
# join and sort rows
joined <- rbind(outright, forwardMonths) %>% arrange(Date, Contract)
# add contract sequence
joined = data.table(joined)
joined = joined[, Contract.seq:=seq(.N), by=Date];
dcast(joined, Date ~ Contract.seq, value.var=c("Price", "Contract"))
Something like this:
library(dplyr)
library(tidyr)
library(lubridate)
list(`1` = diffsOneMonth,
`3` = diffsThreeMonth) %>%
bind_rows(.id = "time_step") %>%
left_join(dataSample %>%
select(Date, Price1, Month1, Year1) ) %>%
mutate(Day.Start = 1) %>%
unite("Date.Start", Day.Start, Month.Start, Year.Start) %>%
mutate(time_step = as.numeric(time_step),
Date.Start =
Date.Start %>%
parse_date_time("%d_%b_%y")) %>%
group_by(time_step, Date) %>%
arrange(Date.Start) %>%
mutate(Price = cumsum(Diff) + Price1) %>%
group_by(Date, Date.Start) %>%
slice(time_step %>% which.max)

convert year-doy to mdy

I would like to convert a column (or create a new one) which is year-day of year to m/d/y. Originally I had year and day-of-year as two separate columns, but I concatenated (paste) them together because I thought I would need the year included with the day of year because of leap years. I am not opposed to using an additional package such as date.
Here is my data:
dat <- structure(list(doy = c(320, 350, 309, 310, 328, 321, 301, 338,
304, 304, 308), year = structure(1:11, .Label = c("2000", "2001",
"2002", "2003", "2004", "2005", "2006", "2007", "2008", "2009",
"2010"), class = "factor"), conc = c("2000-320", "2001-350",
"2002-309", "2003-310", "2004-328", "2005-321", "2006-301", "2007-338",
"2008-304", "2009-304", "2010-308")), row.names = c(NA, -11L), class = "data.frame", .Names = c("doy",
"year", "conc"))
And looks like:
doy year conc
1 320 2000 2000-320
2 350 2001 2001-350
3 309 2002 2002-309
4 310 2003 2003-310
5 328 2004 2004-328
6 321 2005 2005-321
7 301 2006 2006-301
8 338 2007 2007-338
9 304 2008 2008-304
10 304 2009 2009-304
11 308 2010 2010-308
-cherrytree
No additional packages necessary:
within(dat, dtime <- as.POSIXct(conc, format='%Y-%j'))
Something like this works.
as.Date( paste(as.character(dat$year), "-01-01",sep="")) + dat$doy - 1
Just adds the day of the year (minus one) to Jan 1 of the year.

Resources