I have a data.frame that looks like this:
test <- data.frame(ID = c('1','1','1','1','1','1','1','1','2','2','2','2','2','2','2','2',
'3','3','3','3','3','3','3','3','4','4','4','4','4','4','4','4',
'5','5','5','5','5','5','5','5','6','6','6','6','6','6','6','6'),
CAT = c('CAT1','CAT1','CAT1','CAT1','CAT2','CAT2','CAT2','CAT2',
'CAT1','CAT1','CAT1','CAT1','CAT2','CAT2','CAT2','CAT2',
'CAT1','CAT1','CAT1','CAT1','CAT2','CAT2','CAT2','CAT2',
'CAT1','CAT1','CAT1','CAT1','CAT2','CAT2','CAT2','CAT2',
'CAT1','CAT1','CAT1','CAT1','CAT2','CAT2','CAT2','CAT2',
'CAT1','CAT1','CAT1','CAT1','CAT2','CAT2','CAT2','CAT2'),
CODE = c('code1','code2','code3','code4','code1','code2','code3','code4',
'code1','code2','code3','code4','code1','code2','code3','code4',
'code1','code2','code3','code4','code1','code2','code3','code4',
'code1','code2','code3','code4','code1','code2','code3','code4',
'code1','code2','code3','code4','code1','code2','code3','code4',
'code1','code2','code3','code4','code1','code2','code3','code4'),
DATE = c('date1', 'date2', 'date3','date4','date1','date2','date3','date4',
'date1', 'date2', 'date3','date4','date1','date2','date3','date4',
'date1', 'date2', 'date3','date4','date1','date2','date3','date4',
'date1', 'date2', 'date3','date4','date1','date2','date3','date4',
'date1', 'date2', 'date3','date4','date1','date2','date3','date4',
'date1', 'date2', 'date3','date4','date1','date2','date3','date4'),
stringsAsFactors = F)
I would like to have like following:
[
{"id": 1,
"CAT1": ['code1', 'code2','code3', 'code4'],
"CAT1_dates": ['date1', 'date2','date3','date4'],
"CAT2": ['code1', 'code2','code3', 'code4'],
"CAT2_dates": ['date1', 'date2','date3','date4'],
}
{"id": 2,
"CAT1": ['code1', 'code2','code3', 'code4'],
"CAT1_dates": ['date1', 'date2','date3','date4'],
"CAT2": ['code1', 'code2','code3', 'code4'],
"CAT2_dates": ['date1', 'date2','date3','date4'],
}
]
I understood that i need to write a function to do that job. I was not successfull.
From dataFrame to grouped Json in R
convert date frame to json in R
One method using dplyr (for nesting and pivoting) and jsonlite:
library(dplyr)
library(tidyr) # pivot_wider
# library(jsonlite)
test %>%
group_by(ID, CAT) %>%
summarize(x = list(CODE), x_dates = list(DATE)) %>%
pivot_wider(ID, names_from = "CAT", values_from = c("x", "x_dates"),
names_glue = "{CAT}{gsub('^x','',.value)}") %>%
ungroup() %>%
jsonlite::toJSON(pretty = TRUE)
# [
# {
# "ID": "1",
# "CAT1": ["code1", "code2", "code3", "code4"],
# "CAT2": ["code1", "code2", "code3", "code4"],
# "CAT1_dates": ["date1", "date2", "date3", "date4"],
# "CAT2_dates": ["date1", "date2", "date3", "date4"]
# },
# {
# "ID": "2",
# "CAT1": ["code1", "code2", "code3", "code4"],
# "CAT2": ["code1", "code2", "code3", "code4"],
# "CAT1_dates": ["date1", "date2", "date3", "date4"],
# "CAT2_dates": ["date1", "date2", "date3", "date4"]
# },
# ...truncated...
# ]
It's certainly feasible to do this in base R or data.table if needed, though admittedly not as smoothly as that.
Related
I have got several Excel files (see link: we.tl/t-qJl3kVcY0j) that I combine together.
Each Excel file have columns from A to AF as below:
t-ph
Load
HR
BF
V'E
V'O2
V'CO2
d O2/dW
RER
EqO2
EqCO2
PETCO2
VES (ml)
VESi (ml/m²)
FC (bpm)
QC (l/min)
IC (l/min/m²)
PAS (mmHg)
PAD (mmHg)
PAM (mmHg)
ICT
TEV (ms)
RPD (%)
WCI (kg.m/m²)
RVSi (dyn.s/cm5.m²)
RVS (dyn.s/cm5)
VTD est (ml)
FE est (%)
O2Hb
HHb
tHb
HbDiff
My previous code was working, but since I have added columns AC (HHb) to AF (HbDiff), I can't load them anymore. I have tried to match the number of columns, the title, but still not. I can't produce a reproducible example since I can't load my data.
Here is the code I used:
pacman::p_load(tidyverse, readxl, ggpubr)
library(dplyr)
library(ggplot2)
library(afex) ##statistic package
# load data and format
load_files <- function(files){
temp <- read_excel(files) %>%
select(-(c(8:11, 14:15, 18:23))) %>%
mutate(id = pull(.[4,1])) %>% ##ID
mutate(body_mass = pull(.[7,3])) %>% ##body mass
mutate(training = pull(.[4,2])) %>% ##training group
set_names(c("time", "power", "hr", "fr", "VE", "absVO2", "VCO2", "PETCO2", "VES", "QC", "IC", "WCI", "RVSi", "RVS", "VTD", "FE", "O2Hb", "HHb", "tHb", "HbDiff", "id", "body_mass", "training")) %>%
slice(86:which(grepl("ration", VE))-1) %>% ##until recovery period
mutate_at(vars(1:16), as.numeric) %>%
mutate_at(vars(18), as.numeric) %>%
mutate(time = format(as.POSIXct(Sys.Date() + time), "%H:%M", tz="UTC"),
absVO2 = absVO2/1000,
VCO2 = VCO2/1000)
}
# apply function to all files
df <- map_df(file_list, load_files)
# remove those with who have less than four similar power
df <- df %>%
mutate(len_seq = rep(rle(power)$lengths, rle(power)$lengths)) %>%
filter(len_seq == 4) %>%
mutate(seq_id = rep(1:(n()/4), each = 4)) %>%
group_by(id) %>%
select(-seq_id)%>%
select(-(20))
# group data
df_sum <- df %>%
type.convert(as.is = TRUE) %>%
group_by(id, power, training) %>%
summarise_if(is.numeric, mean) %>%
group_by(id) %>%
mutate(percent_absVO2 = absVO2/max(absVO2)*100,
percent_power = power/max(power)*100,
percent_QC = QC/max(QC)*100,
percent_SV = VES/max(VES)*100,
percent_VCO2 = VCO2/max(VCO2)*100,
percent_VE = VE/max(VE)*100) %>%
mutate(VE_VO2 = VE/absVO2,
VE_VCO2 = VE/VCO2) %>%
mutate(RER = VCO2/absVO2, VT = VE/fr) %>%
mutate(relVO2 = absVO2/body_mass*1000,
percent_relVO2 = relVO2/max(relVO2)*100) %>%
mutate(BF = VE/VT) %>%
mutate(mech_perf = (power/(((0.003*power+0.1208)*1000*body_mass)/60))*100) %>%
mutate(group = ifelse(grepl(".*-PRD-C", id), "CAD", "Healthy")) %>%
mutate(temps = ifelse(grepl(".*-PRD-C1", id), "1", ifelse(grepl(".*-PRD-S1", id), "1", "2")))
Then the outcome I get:
Error in `set_names()`:
! The size of `nm` (23) must be compatible with the size of `x` (20).
Run `rlang::last_error()` to see where the error occurred.
Thank you very much for your precious help.
I have following extract of my dataset:
library(dyplr)
library(runner)
example <- data.frame(Date <- c("2020-03-24", "2020-04-06" ,"2020-04-08" ,
"2020-04-13", "2020-04-14", "2020-04-15",
"2020-04-16", "2020-04-18", "2020-04-23",
"2020-04-24", "2020-04-26", "2020-04-29",
"2020-03-24", "2020-04-06" ,"2020-04-08" ,
"2020-04-01", "2020-04-12", "2020-04-15",
"2020-04-17", "2020-04-18", "2020-04-22",
"2020-05-01", "2020-05-15", "2020-05-29",
"2020-03-08", "2020-04-06" ,"2020-04-15",
"2020-04-22", "2020-04-28", "2020-05-05",
"2020-05-08", "2020-05-22", "2020-05-23"),
username <- c("steves_" ,"steves_" ,"steves_",
"steves_" ,"steves_" ,"steves_",
"steves_" ,"steves_" ,"steves_",
"steves_" ,"steves_" ,"steves_",
"jules_" ,"jules_" ,"jules_",
"jules_" ,"jules_" ,"jules_",
"jules_" ,"jules_" ,"jules_",
"jules_" ,"jules_" ,"jules_",
"mia" ,"mia" ,"mia",
"mia" ,"mia" ,"mia",
"mia" ,"mia" ,"mia"),
ER <- as.numeric(c("0.092", "0.08", "0.028",
"0.1", "0.09", "0.02",
"0.02", "0.8", "0.001",
"0.001", "0.1", "0.098",
"0.001", "0.002","0.02",
"0.0098", "0.002","0.0019",
"0.002", "0.11","0.002",
"0.02", "0.01", "0.009",
"0.19", "0.09", "0.21",
"0.22", "0.19", "0.22",
"0.09", "0.19", "0.28")))
colnames(example) <- c("Date", "username", "ER")
example$Date <- as.Date(example$Date)
str(example)
I would like to calculate the respective average of the ER over a month from the respective dates.
I know that there are similar contributions to this already in the forum - but unfortunately I could not find the solution for me.
I have tried the following solutions:
example$avgER_30days <- example %>%
arrange(username, Date) %>%
group_by(username) %>%
mutate(rollmean(example$ER, Date > (Date %m-% months(1)) & Date < Date, fill = NA))
or with the package runners
example$average <- example %>%
group_by(username) %>%
arrange(username, Date) %>%
mutate(mean_run(x = example$ER, k = 30, lag = 1, idx=example$Date)) %>%
ungroup(username)
I would be happy if you could help me!
Here are two equivalent alternatives.
In the first alternative below, the second argument to rollapplyr is a list such that the ith component is the vector of offsets to average over for the ith row of the group.
In the second alternative we can specify the width as a vector of widths, one per row, and then when taking the mean eliminate the last value.
Note that w is slightly different in the two alternatives.
Review ?rollapply for details on the arguments and for further examples.
library(dplyr, exclude = c("filter", "lag"))
library(zoo)
example %>%
arrange(username, Date) %>%
group_by(username) %>%
mutate(w = seq_along(Date) - findInterval(Date - 30, Date) - 1,
avg30 = rollapplyr(ER, lapply(-w, seq, to = -1), mean, fill=NA)) %>%
ungroup
example %>%
arrange(username, Date) %>%
group_by(username) %>%
mutate(w = seq_along(Date) - findInterval(Date - 30, Date),
avg30 = rollapplyr(ER, w, function(x) mean(head(x, -1)), fill = NA)) %>%
ungroup
Hi i am trying to convert the coefficients of a linear model into an json file.I have converted to an ject first and then convert it to a json file.I have multiple factors but only one factor output is able to write to the json file.Any leads will be helpful.
mod
fs<-summary(mod)
df<-fs$coefficients
my_json<-jsonlite::toJSON(df,force=TRUE,pretty=TRUE)
#print(my_json)
write(my_json,"exportnew.JSON")
library(tidyverse)
library(jsonlite)
lm(Sepal.Length ~ Species + Sepal.Width, data = iris) %>%
coefficients() %>%
enframe() %>%
write_json("model_coeffs.json", pretty = TRUE)
Resulting in file model_coeffs.json with content:
[
{
"name": "(Intercept)",
"value": 2.2514
},
{
"name": "Speciesversicolor",
"value": 1.4587
},
{
"name": "Speciesvirginica",
"value": 1.9468
},
{
"name": "Sepal.Width",
"value": 0.8036
}
]
You can nest specific variables e.g.
library(tidyverse)
library(jsonlite)
factor <- mpg %>% colnames() %>% paste0(collapse = "|")
lm(displ ~ manufacturer + class + year, data = mpg) %>%
coefficients() %>%
enframe() %>%
mutate(factor = name %>% str_extract(factor)) %>%
nest(-factor) %>%
mutate(data = data %>% map(~ as.numeric(.$value))) %>%
write_json("model_coeffs.json", pretty = TRUE)
resulting in
[
{
"data": [-50.2851]
},
{
"factor": "manufacturer",
"data": [1.246, 1.2091, 1.1909, -1.1599, -0.3859, 0.7445, 0.5315, 1.6728, 0.6315, 0.2552, 1.276, -0.7175, -0.0727, -0.3671]
},
{
"factor": "class",
"data": [-2.3777, -2.1805, -2.6961, -1.4081, -2.0045, -1.1207]
},
{
"factor": "year",
"data": [0.0275]
}
]
i have a dataset:
x = data.frame(store=c("store1", "store1", "store1","store2","store2", "store3", "store3", "store4", "store4", "store4"),
pos=c("room1", "room2", "room2", "room1", "room1", "room1", "room1", "room2", "room2", "room3"),
error=c("error1", "error2", "error2", "error5", "error6", "error2", "error3", "error1", "error3", "error2"),
time = c("10:00:14", "10:00:44", "10:20:31", "10:24:11", "10:55:14", "10:20:10", "10:44:12", "10:04:34", "12:34:55", "10:12:17")
)
I want to select rows which have error2 or error5 in error column and maximum time in time column for each store and pos. How could i do it?
So new dataset must be like this:
x_new = data.frame(store=c("store1","store2", "store3", "store4"),
pos=c("room2", "room1", "room1", "room3"),
error=c("error2", "error5", "error2", "error2"),
time = c("10:20:31", "10:24:11", "10:20:10", "10:12:17")
)
library(tidyverse)
library(chron)
x %>%
mutate(time = chron::as.times(time)) %>%
group_by(store, pos, error) %>%
filter(error %in% c("error2", "error5")) %>%
summarise(time = max(time, na.rm = T))
Code below used to web scrape a website using API call. I just have to change the startDate and endDate to get data set that I want. Previously it works fine, doing its loops wonderfully but after I did some modification on the html_nodes() part - try to extract different section in the webpage, it keep return me data of the same date. with error warning 'Error in seq.int(0, to0 - from, by) : wrong sign in 'by' argument in r'. What should be done here?
library(tidyverse)
library(readr)
library(tidyr)
library(dplyr)
library(xlsx)
library(beepr)
get_sounding_data <- function(region = c("naconf", "samer", "pac", "nz", "ant",
"np", "europe", "africa", "seasia", "mideast"),
date,
from_hr = c("00", "12", "all"),
to_hr = c("00", "12", "all"),
station_number = 48615) {
# we use these pkgs (the readr and dplyr dependencies removed)
suppressPackageStartupMessages({
require("xml2", quietly = TRUE)
require("httr", quietly = TRUE)
require("rvest", quietly = TRUE)
})
# validate region
region <- match.arg(
arg = region,
choices = c(
"naconf", "samer", "pac", "nz", "ant",
"np", "europe", "africa", "seasia", "mideast"
)
)
# validates the date for us if it's a character string
date <- as.Date(date)
# get year and month
year <- as.integer(format(date, "%Y"))
stopifnot(year %in% 1973:as.integer(format(Sys.Date(), "%Y")))
year <- as.character(year)
month <- format(date, "%m")
# we need these to translate day & *_hr to the param the app needs
c(
"0100", "0112", "0200", "0212", "0300", "0312", "0400", "0412",
"0500", "0512", "0600", "0612", "0700", "0712", "0800", "0812",
"0900", "0912", "1000", "1012", "1100", "1112", "1200", "1212",
"1300", "1312", "1400", "1412", "1500", "1512", "1600", "1612",
"1700", "1712", "1800", "1812", "1900", "1912", "2000", "2012",
"2100", "2112", "2200", "2212", "2300", "2312", "2400", "2412",
"2500", "2512", "2600", "2612", "2700", "2712", "2800", "2812",
"2900", "2912", "3000", "3012", "3100", "3112"
) -> hr_vals
c(
"01/00Z", "01/12Z", "02/00Z", "02/12Z", "03/00Z", "03/12Z", "04/00Z",
"04/12Z", "05/00Z", "05/12Z", "06/00Z", "06/12Z", "07/00Z", "07/12Z",
"08/00Z", "08/12Z", "09/00Z", "09/12Z", "10/00Z", "10/12Z", "11/00Z",
"11/12Z", "12/00Z", "12/12Z", "13/00Z", "13/12Z", "14/00Z", "14/12Z",
"15/00Z", "15/12Z", "16/00Z", "16/12Z", "17/00Z", "17/12Z", "18/00Z",
"18/12Z", "19/00Z", "19/12Z", "20/00Z", "20/12Z", "21/00Z", "21/12Z",
"22/00Z", "22/12Z", "23/00Z", "23/12Z", "24/00Z", "24/12Z", "25/00Z",
"25/12Z", "26/00Z", "26/12Z", "27/00Z", "27/12Z", "28/00Z", "28/12Z",
"29/00Z", "29/12Z", "30/00Z", "30/12Z", "31/00Z", "31/12Z"
) -> hr_inputs
hr_trans <- stats::setNames(hr_vals, hr_inputs)
o_from_hr <- from_hr <- as.character(tolower(from_hr))
o_to_hr <- to_hr <- as.character(tolower(to_hr))
if ((from_hr == "all") || (to_hr == "all")) {
from_hr <- to_hr <- "all"
} else {
from_hr <- hr_trans[sprintf("%s/%02dZ", format(date, "%d"), as.integer(from_hr))]
match.arg(from_hr, hr_vals)
to_hr <- hr_trans[sprintf("%s/%02dZ", format(date, "%d"), as.integer(to_hr))]
match.arg(to_hr, hr_vals)
}
# clean up the station number if it was entered as a double
station_number <- as.character(as.integer(station_number))
# execute the API call
httr::GET(
url = "http://weather.uwyo.edu/cgi-bin/sounding",
query = list(
region = region,
TYPE = "TEXT:LIST",
YEAR = year,
MONTH = sprintf("%02d", as.integer(month)),
FROM = from_hr,
TO = to_hr,
STNM = station_number
)
) -> res
# check for super bad errors (that we can't handle nicely)
httr::stop_for_status(res)
# get the page content
doc <- httr::content(res, as="text")
# if the site reports no data, issue a warning and return an empty data frame
if (grepl("Can't get", doc)) {
doc <- xml2::read_html(doc)
msg <- rvest::html_nodes(doc, "body")
msg <- rvest::html_text(msg, trim=TRUE)
msg <- gsub("\n\n+.*$", "", msg)
warning(msg)
return(data.frame(stringsAsFactors=FALSE))
}
# if the site reports no data, issue a warning and return an empty data frame
if (grepl("Can't get", doc)) {
doc <- xml2::read_html(doc)
msg <- rvest::html_nodes(doc, "body")
msg <- rvest::html_text(msg, trim=TRUE)
msg <- gsub("\n\n+.*$", "", msg)
warning(msg)
return(data.frame(stringsAsFactors=FALSE))
}
# turn it into something we can parse
doc <- xml2::read_html(doc)
raw_dat <- doc %>%
html_nodes("h3+ pre") %>%
html_text()
indices <- raw_dat %>%
str_split(pattern = "\n", simplify = T) %>%
map_chr(str_squish) %>%
tibble(x = .) %>%
separate(x, into = c("Station", "Value"), sep = ": ") %>%
filter(!is.na(Value))
data <- tidyr::spread(indices, Station, Value)
data
}
startDate <- as.Date("01-11-1979", format="%d-%m-%y")
endDate <- as.Date("31-01-1980",format="%d-%m-%y")
days <- seq(startDate, endDate, "1 day")
lapply(days[1:92], function(day) {
get_sounding_data(
region = "seasia",
date = day,
from_hr = "00",
to_hr = "00",
station_number = "48615"
)
}) -> soundings_48615
warnings()
new_df <- map(soundings_48615, . %>% mutate_all(parse_guess))
dat <- bind_rows(new_df)
dat <- dat %>% separate(col =`Observation time`, into = c('Date', 'time'), sep = '/')
dat$Date <- as.Date(dat$Date, format = "%y%m%d")
#save in text file
write.csv(dat, 'c:/Users/Hp/Documents/yr/climatology/yr_SoundingIndexLowerPart/1979.csv')
get_sounding_data <- NULL
beep()
It seems that the error you are having right now, is date format based. More specific the following times
as.Date("01-11-1979", format="%d-%m-%y")
as.Date("31-01-1980",format="%d-%m-%y")
outputs
"2019-11-01"
"2019-01-31"
R's date/time format is international standard yyyy-mm-dd. As such the "2019-11-01" comes after "2019-01-31" time wise. As such this will cause sequence to fail if it tries to iterate 1 positive day at a time. The formatting is the problem here, for this to be solved there is a very simple solution. Always work in international standard date formats, as these will be recognized by (almost) all programs.
As such change the date sequence part of your code to
....
startDate <- as.Date("1979-11-01")
endDate <- as.Date("1980-01-31")
days <- seq(startDate, endDate, "1 day")
....
Note the format change. As for why it changes 1979 to 2019, i am not entirely sure, someone else might have a more intricate answer for this weird behaviour.
I got this error when I was trying to create a sequence of dates using the seq() function. Turns out I was putting the start and end dates in the wrong order.
This would give me the error:
seq(as.Date("2012-12-30"), as.Date("2011-01-04"), by="days")
Error in seq.int(0, to0 - from, by) : wrong sign in 'by' argument
But this would not give me the error and would give me my date sequence:
seq(as.Date("2011-12-30"), as.Date("2012-01-04"), by="days")