I've got the following dataset:
# Example data: two candidate grouping columns (year, mth) and two value columns.
tab <- tibble(
  year = c(2017, 2017, 2017, 2018, 2018, 2018),
  mth = c("Apr", "Apr", "Jun", "Jul", "Jul", "Sep"),
  var1 = 1:6,
  var2 = 10:15
)
Is it possible to use kableExtra to generate a table of this data where there are two grouping variables, year and month? This would give:
var1 var2
2017
Apr
1 10
2 11
Jun
3 12
2018
Jul
4 13
5 14
Sep
6 15
I've tried:
# Asker's attempt at grouping rows by two variables; reported below as not working.
# NOTE(review): `year` is not an object defined in this snippet and `tab` has no
# `Month` column -- `tab$year` was presumably intended; confirm with the asker.
kable(tab[,3:4]) %>% pack_rows(index = table(year$Month, tab$mth))
It works fine with one grouping variable, but it doesn't work for two grouping variables.
This tutorial has great examples and explains how to do this.
library(dplyr)
library(kableExtra)
# Render `tab` with blank headers over the two grouping columns, then merge
# repeated values in columns 1-2 so year and month read as nested row groups.
kable(tab, align = "c", col.names = c("", "", names(tab)[3:4])) %>%
  kable_styling(full_width = FALSE) %>%
  # bold the first (year) column
  column_spec(1, bold = TRUE) %>%
  collapse_rows(columns = 1:2, valign = "top")
Related
I have many cuts of my data that I eventually join together into one large dataset. However, sometimes the object is an error message because it didn't have enough sample size, causing the code to fail.
Before I do my full_joins, I want a simple way to say "If the length of any of these objects is 1, then make that object--and only those objects--have this set structure with just NAs". Is there a simple way to do that other than an if statement for each object? Or, alternatively, is there a way for R to 'skip' over the problematic rows if there's an error message (without relying on any specific characters)? I've used try(), but that doesn't always work and sometimes stops continuing to other joins.
#Here's an example of my data
library(dplyr)
# Three real result tables ...
object_1 <- tibble(
  name = c("Justin", "Corey"),
  month = rep("Jan", 2),
  score = c(1, 2)
)
object_2 <- tibble(
  name = c("Justin", "Corey"),
  month = rep("Feb", 2),
  score = c(100, 200)
)
object_4 <- tibble(
  name = c("Justin", "Corey"),
  month = rep("Apr", 2),
  score = c(95, 23)
)
# ... and two objects that hold an error string instead of a table.
object_3 <- "error message!"
object_5 <- "Another error!!"
# Joining everything in sequence; this is the step the question reports as
# failing, because object_3 and object_5 are strings rather than tables.
all_the_objects <- full_join(object_1, object_2) %>%
  full_join(object_3) %>%
  full_join(object_4) %>%
  full_join(object_5)
#Here's a solution that works, but doesn't seem very elegant:
# Placeholder with the same columns as the real tables, holding a single row of
# typed NAs so the column types line up with the real data during the joins.
na_placeholder <- tibble(
  name = NA_character_,
  month = NA_character_,
  score = NA_real_
)
# Each object is checked on its own. Chaining these tests with `else if` would
# stop at the first length-1 object and leave any later error string untouched,
# so the joins below would still fail.
if (length(object_1) == 1) {
  object_1 <- na_placeholder
}
if (length(object_2) == 1) {
  object_2 <- na_placeholder
}
if (length(object_3) == 1) {
  object_3 <- na_placeholder
}
if (length(object_4) == 1) {
  object_4 <- na_placeholder
}
if (length(object_5) == 1) {
  object_5 <- na_placeholder
}
#Now it'll work
all_the_objects <- object_1 %>%
  full_join(object_2) %>%
  full_join(object_3) %>%
  full_join(object_4) %>%
  full_join(object_5)
We may place the objects in a list and do the check at once and then join with reduce
library(dplyr)
library(purrr)
# Collect every object named object_<digits> into a list, swap anything that is
# not a data frame for a one-row NA placeholder, then fold the list with
# full_join. Testing with is.data.frame() rather than is.vector() also catches
# non-table values that carry attributes (for example the result of a failed
# try()), which is.vector() would report as FALSE and let through.
map(mget(ls(pattern = '^object_\\d+$')),
    ~ if (!is.data.frame(.x)) {
      tibble(name = NA_character_, month = NA_character_, score = NA_real_)
    } else {
      .x
    }) %>%
  reduce(full_join)
-output
# A tibble: 7 × 3
name month score
<chr> <chr> <dbl>
1 Justin Jan 1
2 Corey Jan 2
3 Justin Feb 100
4 Corey Feb 200
5 <NA> <NA> NA
6 Justin Apr 95
7 Corey Apr 23
I have a dataframe like this
X 2001,2002,2003
JAN NA,1,2
JUN NA,2,3
DEC 1,2,NA
I want an empty vector to store values and generate a time series
What can I do
Intended output, ordered by year and then by month, with NAs omitted:
output=c(1,1,2,2,2,3)
How can I do?
You might go that direction:
library(tidyverse)
# Month-by-year table from the question, one column per year.
dta <- tribble(
  ~X,    ~"2001", ~"2002", ~"2003",
  "JAN", NA,      1,       2,
  "JUN", NA,      2,       3,
  "DEC", 1,       2,       NA
)
# Stack the year columns into (X, year, val) rows, drop the missing values and
# order by year; rows within a year keep the month order of the input table.
dta %>%
  pivot_longer(cols = -X, names_to = "year", values_to = "val") %>%
  drop_na(val) %>%
  arrange(year)
However, you need to ensure that the months are sorted correctly.
I used the dcast function to show the spendings per month of different companies. Of course I want January first, then February etc. and not the alphabetical order.
# Sum EUR per Firma / Produktmarke / `Name Kurz` for the "Riegel" family in 2017.
Spendings <- data %>%
  filter(Familie == "Riegel", Jahr == "2017") %>%
  group_by(Firma, Produktmarke, `Name Kurz`) %>%
  summarise(Spendingsges = sum(EUR, na.rm = TRUE))
# Spread the `Name Kurz` values into columns, one row per Firma + Produktmarke.
Spendings <- dcast(
  Spendings,
  Firma + Produktmarke ~ `Name Kurz`,
  value.var = "Spendingsges"
)
Spendings
Firma Produktmarke Apr Aug Dez Feb Jan Jul Jun Mai Mrz Nov Okt Sep
Company1 Product1 228582 1902138 725781 NA 709970 NA 265313 228177 NA NA 1463258 4031267
Is there a way to reorder the columns dynamically? For 2018, for example, the data frame has fewer columns, so I cannot use:
# Fixed calendar-order selection; it needs every listed column to be present.
Spendings <- Spendings[, c(
  "Firma", "Produktmarke",
  "Jan", "Feb", "Mrz", "Apr", "Mai", "Jun",
  "Jul", "Aug", "Sep", "Okt", "Nov", "Dez"
)]
# Empty 14-column template whose column order is the desired calendar order.
Spendings_raw <- setNames(
  data.frame(matrix(ncol = 14, nrow = 0)),
  c(
    "Firma", "Produktmarke",
    "Jan", "Feb", "Mrz", "Apr", "Mai", "Jun",
    "Jul", "Aug", "Sep", "Okt", "Nov", "Dez"
  )
)
Spendings_raw
# Same aggregation and reshape as before.
Spendings <- data %>%
  filter(Familie == "Riegel", Jahr == "2017") %>%
  group_by(Firma, Produktmarke, `Name Kurz`) %>%
  summarise(Spendingsges = sum(EUR, na.rm = TRUE))
Spendings <- dcast(
  Spendings,
  Firma + Produktmarke ~ `Name Kurz`,
  value.var = "Spendingsges"
)
# Stack the result under the empty template so its column layout is used.
Spendings <- rbind.fill(Spendings_raw, Spendings)
This works perfectly ;-).
My data are originally by week (examples below). I find it difficult to work with them as a time series, since time series data are usually in the form dd-mm-yy.
WEEK SALES
1: 29.2010 60.48
2: 30.2010 95.76
3: 31.2010 51.66
4: 32.2010 73.71
5: 33.2010 22.05
Thanks in advance!
We can convert the week number as date using functions from the lubridate package, and then plot the date on the x-axis and SALES on the y-axis.
library(tidyverse)
library(lubridate)
# Split the "week.year" text into WEEK and YEAR columns (convert = TRUE lets
# separate() convert the pieces from text), then place each week at
# 1 Jan 2010 plus (WEEK - 1) weeks.
week_year <- separate(dat, WEEK, into = c("WEEK", "YEAR"), convert = TRUE)
dat2 <- mutate(week_year, Date = ymd("2010-01-01") + weeks(WEEK - 1))
# Line plot of sales against the derived date.
ggplot(data = dat2, mapping = aes(x = Date, y = SALES)) +
  geom_line()
DATA
# Sample weekly sales. The header has one field fewer than the data rows, so
# the first field of each row becomes the row name; WEEK is kept as text.
week_sales_txt <- " WEEK SALES
1 '29.2010' 60.48
2 '30.2010' 95.76
3 '31.2010' 51.66
4 '32.2010' 73.71
5 '33.2010' 22.05"
dat <- read.table(
  text = week_sales_txt,
  header = TRUE,
  colClasses = c("character", "character", "numeric"),
  stringsAsFactors = FALSE
)
UPDATE
If the data are from different years, we can use the following code.
# As before, but the start date is 1 January of each row's own YEAR, so weeks
# from different years are placed in the right year.
dat2 <- mutate(
  separate(dat, WEEK, into = c("WEEK", "YEAR"), convert = TRUE),
  Date = ymd(paste(YEAR, "01", "01", sep = "-")) + weeks(WEEK - 1)
)
DATA
# Sample weekly sales spanning 2010 and 2011. The header has one field fewer
# than the data rows, so the first field of each row becomes the row name;
# WEEK is kept as text.
week_sales_txt <- " WEEK SALES
1 '29.2010' 60.48
2 '30.2010' 95.76
3 '31.2010' 51.66
4 '32.2010' 73.71
5 '33.2010' 22.05
6 '1.2011' 37.5
7 '2.2011' 45.2
8 '3.2011' 62.9"
dat <- read.table(
  text = week_sales_txt,
  header = TRUE,
  colClasses = c("character", "character", "numeric"),
  stringsAsFactors = FALSE
)
I have mean daily data for different sites organized as shown in figure 1 in this folder.
However, I want to organize this data to look like figure 2 in the same folder.
Using this code, the data was reshaped but the final values (reshpae_stage_R.csv) didn't match the original values.
By running the code for the second time, I got this error:
Error in `row.names<-.data.frame`(`*tmp*`, value = paste(d[, idvar], times[1L], :
duplicate 'row.names' are not allowed
In addition: Warning message:
non-unique value when setting 'row.names': ‘NA.January’
Could you please help me why the final values don't match the original values?
Thanks in advance
Update:
Thanks to @aelwan for catching a bug; the updated code is below:
library(ggplot2)
library(reshape2)
# read in the data (no header row; all columns are read as they come)
dfStage = read.csv("reshapeR/Data/stage.csv", header = FALSE, stringsAsFactors = FALSE)
# remove the rows which are min, max, mean & redundant columns
# (only the first 13 columns are kept: the day plus the twelve months)
condMMM = stringr::str_trim(dfStage[, 1]) %in% c("Min", "Max", "Mean", "Day")
dfStage = dfStage[!condMMM, 1:13]
dateVars = c("Day", "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
colnames(dfStage) = dateVars
# get indices & names of year site combinations
# (title rows start with "Daily means"; condiSiteYear is not used further down
# in this snippet)
condlSiteYear = grepl("^Daily means", stringr::str_trim(dfStage[, 1]))
condiSiteYear = grep("^Daily means", stringr::str_trim(dfStage[, 1]))
dfSiteYear = dfStage[condlSiteYear, 1, drop = FALSE]
# remove site-year rows from data
dfStage = dfStage[!condlSiteYear, ]
# get the list of sites and years
# Year: the digits following "Year "; Site: the text following "Stage (mm) at "
dfSiteYear$Year = regmatches(dfSiteYear[, 1], regexpr("(?<=Year\\s)([0-9]+)", dfSiteYear[, 1], perl = TRUE))
dfSiteYear$Site = regmatches(dfSiteYear[, 1],
                             regexpr("(?<=(Stage\\s\\(mm\\)\\sat\\s))([A-Za-z\\s0-9\\.]+)", dfSiteYear[, 1], perl = TRUE))
# add the site and years
# each = 31 repeats every site-year label 31 times in a row, one per day-row
# (assumes every site-year table contributes exactly 31 rows)
dfSiteYearLong = dfSiteYear[rep(seq_len(nrow(dfSiteYear)), each = 31), c("Site", "Year")]
dfStageFinal = cbind(dfStage, dfSiteYearLong)
# reshape: long by month first, then one column per site
dfStageFinalLong = reshape2::melt(dfStageFinal, id.vars = c("Day", "Site", "Year"),
                                  measure.vars = dateVars[-1],
                                  variable.name = "Month")
dfStageFinalWide = reshape2::dcast(dfStageFinalLong, Day + Month + Year ~ Site,
                                   value.var = "value")
# cleanup: every column after Day/Month/Year is a site column; make it numeric
dfStageFinalWide[, -c(1:3)] = lapply(dfStageFinalWide[, -c(1:3)], as.numeric)
# create a date variable (%b parses abbreviated month names)
dfStageFinalWide$Date = with(dfStageFinalWide,
                             as.Date(paste(Day, Month, Year, sep = "-"),
                                     format = "%d-%b-%Y"))
# remove the infeasible dates (day/month combinations that fail to parse are NA)
dfStageFinalWide = dfStageFinalWide[!is.na(dfStageFinalWide$Date), ]
dfStageFinalWide = dfStageFinalWide[order(dfStageFinalWide$Date), ]
# plot the values over time, one panel per site with its own y scale
dfStageFinalLong =
  reshape2::melt(dfStageFinalWide, id.vars = "Date", measure.vars = unique(dfSiteYear$Site),
                 variable.name = "Site")
ggplot(dfStageFinalLong, aes(x = Date, y = value, color = Site)) +
  geom_line() + theme_bw() + facet_wrap(~ Site, scales = "free_y")
This leads to the picture below:
Original answer:
This example requires a fair amount of data munging skill. You basically have to notice the repeating patterns in the data -- the data are site-year measurements organized as day x month tables.
Recipe:
Here is a recipe for creating the desired dataset:
1. Remove the rows & columns in the data that are redundant.
2. Extract the rows that identify the year and the site of the table using pattern matching (grep).
3. From the longer string, extract the year and site name using regular expressions (regexpr and regmatches).
4. Find the starting row indices of the tables for each site-year combination and assign the site-year names just extracted to all rows that correspond to that site & year.
5. Now you can go ahead and reshape it into any shape you want. In the code below, the row identifiers are year, month and day, and the columns are the sites.
6. Some cleanup, and you are good to go.
Code:
Here is code for the recipe above:
# read in the data (no header row; all columns are read as they come)
dfStage = read.csv("reshapeR/Data/stage.csv", header = FALSE, stringsAsFactors = FALSE)
# remove the rows which are min, max, mean & redundant columns
# (only the first 13 columns are kept: the day plus the twelve months)
condMMM = stringr::str_trim(dfStage[, 1]) %in% c("Min", "Max", "Mean", "Day")
dfStage = dfStage[!condMMM, 1:13]
dateVars = c("Day", "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
colnames(dfStage) = dateVars
# get indices & names of year site combinations
# (title rows start with "Daily means"; condiSiteYear is not used further down
# in this snippet)
condlSiteYear = grepl("^Daily means", stringr::str_trim(dfStage[, 1]))
condiSiteYear = grep("^Daily means", stringr::str_trim(dfStage[, 1]))
dfSiteYear = dfStage[condlSiteYear, 1, drop = FALSE]
# remove site-year rows from data
dfStage = dfStage[!condlSiteYear, ]
# get the list of sites and years
# Year: the digits following "Year "; Site: the text following "Stage (mm) at "
dfSiteYear$Year = regmatches(dfSiteYear[, 1], regexpr("(?<=Year\\s)([0-9]+)", dfSiteYear[, 1], perl = TRUE))
dfSiteYear$Site = regmatches(dfSiteYear[, 1],
                             regexpr("(?<=(Stage\\s\\(mm\\)\\sat\\s))([A-Za-z\\s0-9\\.]+)", dfSiteYear[, 1], perl = TRUE))
# add the site and years
# each = 31 repeats every site-year label 31 times in a row so the labels line
# up with the day-rows of each table; repeating the whole index sequence 31
# times instead would cycle through the labels and attach them to the wrong
# rows (assumes every site-year table contributes exactly 31 rows)
dfSiteYearLong = dfSiteYear[rep(seq_len(nrow(dfSiteYear)), each = 31), c("Site", "Year")]
dfStageFinal = cbind(dfStage, dfSiteYearLong)
# reshape: long by month first, then one column per site
dfStageFinalLong = reshape2::melt(dfStageFinal, id.vars = c("Day", "Site", "Year"),
                                  measure.vars = dateVars[-1],
                                  variable.name = "Month")
dfStageFinalWide = reshape2::dcast(dfStageFinalLong, Day + Month + Year ~ Site,
                                   value.var = "value")
# cleanup: every column after Day/Month/Year is a site column; make it numeric
dfStageFinalWide[, -c(1:3)] = lapply(dfStageFinalWide[, -c(1:3)], as.numeric)
# create a date variable (%b parses abbreviated month names)
dfStageFinalWide$Date = with(dfStageFinalWide,
                             as.Date(paste(Day, Month, Year, sep = "-"),
                                     format = "%d-%b-%Y"))
# remove the infeasible dates (day/month combinations that fail to parse are NA)
dfStageFinalWide = dfStageFinalWide[!is.na(dfStageFinalWide$Date), ]
dfStageFinalWide = dfStageFinalWide[order(dfStageFinalWide$Date), ]
# plot the values over time, one panel per site with its own y scale
dfStageFinalLong =
  reshape2::melt(dfStageFinalWide, id.vars = "Date", measure.vars = unique(dfSiteYear$Site),
                 variable.name = "Site")
ggplot(dfStageFinalLong, aes(x = Date, y = value, color = Site)) +
  geom_line() + theme_bw() + facet_wrap(~ Site, scales = "free_y")
Output:
Here is what the output looks like:
> head(dfStageFinalWide)
Day Month Year Kumeti at Te Rehunga Makakahi at Hamua Makuri at Tuscan Hills Manawatu at Hopelands Manawatu at Upper Gorge Manawatu at Weber Road Mangahao at Ballance
1 1 Jan 1990 454 NA 700 5133 NA NA NA
2 1 Jan 1991 1002 3643 1416 50 3597 1836 18160
3 1 Jan 1992 3490 34239 8922 3049 1221 417 NA
4 1 Jan 1993 404 NA 396 3408 NA 272 NA
5 1 Jan 1994 NA NA 3189 795 NA 2321 1889
6 1 Jan 1995 16548 1923 69862 4808 NA 6169 94
Mangapapa at Troup Rd Mangatainoka at Larsons Road Mangatainoka at Pahiatua Town Bridge Mangatainoka at Tararua Park Mangatoro at Mangahei Road Oruakeretaki at S.H.2 Napier
1 9406 2767 NA NA 6838 2831
2 4985 2479 823 1078 76 105
3 478 3665 1415 210 394 8247
4 6394 1298 NA 2668 3837 1878
5 14051 3561 NA 2645 807 NA
6 NA 1057 7029 4497 NA NA
Raparapawai at Jackson Rd Tamaki at Stephensons Tiraumea at Ngaturi
1 5189 50444 17951
2 345 416 3025
3 1364 5713 1710
4 3457 28078 8670
5 199 NA 292
6 NA NA 22774
And a picture to bring it all together.