Related
M_aff <- API_info$affil_info names(API_info$affil_info) <- c("entry_number", "doi", "affiliation_name", "affiliation_city", "affiliation_country", "affiliation_id") filter(is.na(as.numeric(gsub("([0-9]+).*$", "\1", affiliation_name)))) %>% mutate(affiliation_name = str_replace_all(affiliation_name, "&", "&"), affiliation_name = stringi::stri_trans_general(affiliation_name, "ASCII"), affiliation_name = str_replace_all(affiliation_name, "\.|\(|\)|\-|\\", " "), affiliation_name = trimws(affiliation_name, which = "both")) %>% filter(str_detect(affiliation_name, "netflix")) %>% distinct(entry_number, affiliation_id, .keep_all = T) %>% left_join(cleaned_data %>% select(entry_number, citations, year), by = "entry_number")
ERROR:Error in is.factor(x) : object 'affiliation_name' not found
The M_aff is the set of observations with 6 variables.
The "netflix" file is the file with the raw data.
I also am not sure if i should be using filter, I want to clean my data (remove the false positives): the "netflix" file that is a list of documents published by netflix.
I extracted the Netflix file as following:
api_key <- '26d72e4e014f3c18226cd4bf29725557' set_api_key(api_key) netflix <- scopus_search(api_key ='26d72e4e014f3c18226cd4bf29725557', query = "affil(netflix)", count=1) df_netflix = gen_entries_to_df(netflix$entries) save(df_netflix, file = paste(getwd(), 'netflix/input_data/df_netflix.RData', sep='/'))
Then I applied the following code to clean:
cleaned_data <- df_netflix$df %>%
select("eid", "dc:title", "dc:creator", "prism:publicationName", "prism:issn", "prism:eIssn", "prism:coverDate", "prism:doi",
"citedby-count", "subtypeDescription", "entry_number") %>%
rename(title = "dc:title",
creator = "dc:creator",
journal = "prism:publicationName",
date = "prism:coverDate",
doi = "prism:doi",
citations = "citedby-count",
doc_type = "subtypeDescription",
issn_og = "prism:issn",
eissn_og = "prism:eIssn") %>%
mutate(year = str_sub(date,start=1, end=4),
issn = case_when(is.na(issn_og) & !is.na(eissn_og) ~ eissn_og,
!is.na(issn_og) & is.na(eissn_og) ~ issn_og,
!is.na(issn_og) & !is.na(eissn_og) ~ paste(issn_og, eissn_og, sep=', '),
TRUE ~ 'NA'
)) %>%
left_join(sjrdata::sjr_journals %>% select(year, issn, sjr, sjr_best_quartile, h_index), by=c('year','issn'))
openxlsx::write.xlsx(cleaned_data, file= paste(getwd(), 'netflix/output_data/netflix.xlsx', sep='/'))
save(cleaned_data, file = paste(getwd(), 'netflix/input_data/cleaned_data.RData', sep='/'))
api_key <- '26d72e4e014f3c18226cd4bf29725557'
set_api_key(api_key)
doi_not_na <-
cleaned_data %>%
as_tibble() %>%
select(entry_number, doi) %>%
filter(!is.na(doi))
API_info <- get_api_info(doi_not_na)
save(API_info, file = paste(getwd(), "netflix/input_data/API_info_netflix.RData", sep = "/"))
Then I was trying to continue to clean data as following:
M_aff <-
API_info$affil_info
names(API_info$affil_info) <- c("entry_number", "doi", "affiliation_name", "affiliation_city", "affiliation_country", "affiliation_id")
filter(is.na(as.numeric(gsub("([0-9]+).*$", "\\1", affiliation_name)))) %>%
mutate(affiliation_name = str_replace_all(affiliation_name, "&", "&"),
affiliation_name = stringi::stri_trans_general(affiliation_name, "ASCII"),
affiliation_name = str_replace_all(affiliation_name, "\\.|\\(|\\)|\\-|\\\\", " "),
affiliation_name = trimws(affiliation_name, which = "both")) %>%
filter(str_detect(affiliation_name, "netflix")) %>%
distinct(entry_number, affiliation_id, .keep_all = T) %>%
left_join(cleaned_data %>%
select(entry_number, citations, year),
by = "entry_number")
true_positive= M_aff%>%
filter(grepl(pattern = 'netflix', x = tolower(affiliation_id)))%>%
select(doi)%>%
distinct()%>%
pull(doi)
M_aff=M_aff%>%
filter(doi %in% true_positive)
API_info$author_info= API_info$author_info%>%
filter(doi %in% true_positive)
API_info$subject_info=API_info$subject_info%>%
filter(doi %in% true_positive)
API_info$authkey_info= API_info$authkey_info%>%
filter(doi %in% true_positive)
save(M_aff, file = paste(getwd(), "netflix/input_data/M_aff.RData", sep = "/"))
openxlsx::write.xlsx(M_aff, file= paste(getwd(), 'netflix/output_data/netflix_aff.xlsx', sep='/'))
openxlsx::write.xlsx(API_info$author_info, file= paste(getwd(), 'netflix/output_data/netflix_auth.xlsx', sep='/'))
openxlsx::write.xlsx(API_info$subject_info, file= paste(getwd(), 'netflix/output_data/netflix_subject.xlsx', sep='/'))
openxlsx::write.xlsx(API_info$authkey_info, file= paste(getwd(), 'netflix/output_data/netflix_authkey.xlsx', sep='/'))
I have a data.frame that looks like this:
test <- data.frame(ID = c('1','1','1','1','1','1','1','1','2','2','2','2','2','2','2','2',
'3','3','3','3','3','3','3','3','4','4','4','4','4','4','4','4',
'5','5','5','5','5','5','5','5','6','6','6','6','6','6','6','6'),
CAT = c('CAT1','CAT1','CAT1','CAT1','CAT2','CAT2','CAT2','CAT2',
'CAT1','CAT1','CAT1','CAT1','CAT2','CAT2','CAT2','CAT2',
'CAT1','CAT1','CAT1','CAT1','CAT2','CAT2','CAT2','CAT2',
'CAT1','CAT1','CAT1','CAT1','CAT2','CAT2','CAT2','CAT2',
'CAT1','CAT1','CAT1','CAT1','CAT2','CAT2','CAT2','CAT2',
'CAT1','CAT1','CAT1','CAT1','CAT2','CAT2','CAT2','CAT2'),
CODE = c('code1','code2','code3','code4','code1','code2','code3','code4',
'code1','code2','code3','code4','code1','code2','code3','code4',
'code1','code2','code3','code4','code1','code2','code3','code4',
'code1','code2','code3','code4','code1','code2','code3','code4',
'code1','code2','code3','code4','code1','code2','code3','code4',
'code1','code2','code3','code4','code1','code2','code3','code4'),
DATE = c('date1', 'date2', 'date3','date4','date1','date2','date3','date4',
'date1', 'date2', 'date3','date4','date1','date2','date3','date4',
'date1', 'date2', 'date3','date4','date1','date2','date3','date4',
'date1', 'date2', 'date3','date4','date1','date2','date3','date4',
'date1', 'date2', 'date3','date4','date1','date2','date3','date4',
'date1', 'date2', 'date3','date4','date1','date2','date3','date4'),
stringsAsFactors = F)
I would like to have like following:
[
{"id": 1,
"CAT1": ['code1', 'code2','code3', 'code4'],
"CAT1_dates": ['date1', 'date2','date3','date4'],
"CAT2": ['code1', 'code2','code3', 'code4'],
"CAT2_dates": ['date1', 'date2','date3','date4'],
}
{"id": 2,
"CAT1": ['code1', 'code2','code3', 'code4'],
"CAT1_dates": ['date1', 'date2','date3','date4'],
"CAT2": ['code1', 'code2','code3', 'code4'],
"CAT2_dates": ['date1', 'date2','date3','date4'],
}
]
I understood that i need to write a function to do that job. I was not successfull.
From dataFrame to grouped Json in R
convert date frame to json in R
One method using dplyr (for nesting and pivoting) and jsonlite:
library(dplyr)
library(tidyr) # pivot_wider
# library(jsonlite)
test %>%
group_by(ID, CAT) %>%
summarize(x = list(CODE), x_dates = list(DATE)) %>%
pivot_wider(ID, names_from = "CAT", values_from = c("x", "x_dates"),
names_glue = "{CAT}{gsub('^x','',.value)}") %>%
ungroup() %>%
jsonlite::toJSON(pretty = TRUE)
# [
# {
# "ID": "1",
# "CAT1": ["code1", "code2", "code3", "code4"],
# "CAT2": ["code1", "code2", "code3", "code4"],
# "CAT1_dates": ["date1", "date2", "date3", "date4"],
# "CAT2_dates": ["date1", "date2", "date3", "date4"]
# },
# {
# "ID": "2",
# "CAT1": ["code1", "code2", "code3", "code4"],
# "CAT2": ["code1", "code2", "code3", "code4"],
# "CAT1_dates": ["date1", "date2", "date3", "date4"],
# "CAT2_dates": ["date1", "date2", "date3", "date4"]
# },
# ...truncated...
# ]
It's certainly feasible to do this in base R or data.table if needed, though admittedly not as smoothly as that.
Not a json expert, but I need what I think is referred to as "nested objects" and I am getting instead what I think is referred to as "nested arrays". In other words, some extra brackets. I'm trying to convert a dataframe into json data using jsonlite in R. Reproducible code and results below. Can anyone point me to how to get the data in the proper format (rows as nested objects)?
library(jsonlite)
testdat <- data.frame(locationColumn = c("US", "US"),
nameColumn = c("General Motors", "Walmart"),
zipColumn = c(19890, 72712) )
jsl <- jsonlite::toJSON(
list(
config = list(
item1 = list("country",
"city"),
item2 = "true",
item3 = "false",
item4 = 3
),
rows = split(testdat, 1:nrow(testdat))
),
auto_unbox = TRUE,
pretty = TRUE,
dataframe = "rows",
simplifyDataFrame = TRUE
)
jsl
Output:
{
"config": {
"item1": [
"country",
"city"
],
"item2": "true",
"item3": "false",
"item4": 3
},
"rows": {
"1": [
{
"locationColumn": "US",
"nameColumn": "General Motors",
"zipColumn": 19890
}
],
"2": [
{
"locationColumn": "US",
"nameColumn": "Walmart",
"zipColumn": 72712
}
]
}
}
What I need: (EDIT: I added some more complexity to the json. I need to keep the brackets in 'config', but not have brackets in 'rows'.
{
"config": {
"item1": [
"country",
"city"
],
"item2": "true",
"item3": "false",
"item4": 3
},
"rows": {
"1":
{
"locationColumn": "US",
"nameColumn": "General Motors",
"zipColumn": 19890
},
"2":
{
"locationColumn": "US",
"nameColumn": "Walmart",
"zipColumn": 72712
}
}
}
Here is a possible solution:
library(jsonlite)
testdat <- data.frame(locationColumn = c("US", "US"),
nameColumn = c("General Motors", "Walmart"),
zipColumn = c(19890, 72712) )
jsl <- jsonlite::toJSON(
list(
rows = split(testdat, 1:nrow(testdat))
),
auto_unbox = TRUE,
pretty = TRUE,
dataframe = "columns", #change from rows (moves brackets from row level to value level)
simplifyDataFrame = TRUE
)
#removed the backets if desired
#jsl<-gsub("\\[|\\]", "", jsl)
all.equal(testcase, fromJSON(jsl))
testcase<-fromJSON('{
"rows": {
"1":{
"locationColumn": "US",
"nameColumn": "General Motors",
"zipColumn": 19890
},
"2":{
"locationColumn": "US",
"nameColumn": "Walmart",
"zipColumn": 72712
}
}
}')
all.equal(testcase, fromJSON(jsl))
#[1] TRUE
EDIT Here is an approved version that manually edits the list of list in order to obtain the correct format.
#create a list of the data
top<-list(
config = list(
item1 = list("country",
"city"),
item2 = "true",
item3 = "false",
item4 = 3
),
rows = split(testdat, 1:nrow(testdat))
)
#edit the data frames store as part of rows
#lapply - lapply loops will parse each column in each row to create a new list
rows<-lapply(top$rows, function(x){
tempdf<-x
#collist<-lapply(names(tempdf), function(y){print(tempdf[ , y, drop=T])})
collist<-lapply(names(tempdf), function(y){tempdf[, y, drop=T]})
names(collist)<-names(tempdf)
collist
})
#update the list with the list of list
top$rows<-rows
#make the JSON
jsl <- jsonlite::toJSON(
top,
auto_unbox = TRUE,
pretty = TRUE,
dataframe = "columns",
simplifyDataFrame = TRUE
)
I have a section in my Shiny app that generates a list.
names of the list are column names of the dataframe we will calculate on,
list items contain the calculations we want
Looking to do this:
apply to all list names:
for listname (column) x calculate function n,m,o over df column x
and name the resulting column 'x.n' i.e. 'cyl.mean', 'mpg.sum'
to get a dataframe of summary statistics PER GROUP (mtcars$cyl) in this case as example
It is linked to this question, but there the example data used a separate list of column names, and apply the same functions to all those columns from another list. I'm looking to move forward to apply unique sets of functions to different columns
The list my app spits out looks like this:
mylist
$disp
[1] "sum" "mean"
$hp
[1] "sd"
$drat
[1] "sum" "mean"
$wt
[1] "max"
expected output:
cyl disp.sum hp.sd drat.sum drat.mean wt.max
4 x ....
6 x ....
8 x ....
The little Shiny app to create the list:
library(shiny)
library(data.table)
library(shinyjs)
Channels <- names(mtcars)[3:8]
ui <- fluidPage(
shinyjs::useShinyjs(),
h5('Channels', style = 'font-weight:bold'),
uiOutput('ChannelCheckboxes'),
h5('Statistics', style = 'font-weight:bold'),
uiOutput('CalculationCheckboxes')
)
server <- function(input, output, session) {
values <- reactiveValues(Statisticlist = list())
## build observer to deselect all sub category checkboxes if channel is deselected
lapply(Channels, function(x) {
observeEvent(input[[paste('Channel', x, sep = '')]], {
if(!input[[paste('Channel', x, sep = '')]]) {
shinyjs::disable(paste("Calculations", x, sep = ''))
updateCheckboxGroupInput(session, inputId = paste("Calculations", x, sep = ''), selected=character(0))
} else {
shinyjs::enable(paste("Calculations", x, sep = ''))
}
})
})
output$ChannelCheckboxes <- renderUI({
fluidRow(
lapply(Channels, function(x) {
column(2,
checkboxInput(inputId = paste('Channel', x, sep = ''), label = x)
)
})
)
})
output$CalculationCheckboxes <- renderUI({
fluidRow(
lapply(Channels, function(x) {
column(2,
checkboxGroupInput(inputId = paste("Calculations", x, sep = ''), label = NULL, c('sum', 'mean', 'length', 'max', 'min', 'sd')) ) })
)
})
lapply(Channels, function(x) {
observe({
req(input[[paste('Channel', x, sep = '')]])
if(input[[paste('Channel', x, sep = '')]] & !is.null(input[[paste("Calculations", x, sep = '')]])){
values$Statisticlist[[paste(x)]] <- input[[paste("Calculations", x, sep = "")]]
}
})
})
observeEvent(values$Statisticlist, { print(values$Statisticlist)
mylist <<- values$Statisticlist
})
}
shinyApp(ui, server)
If I understand correctly, the question is not about shiny in first place but about how to apply different aggregation functions to specific columns of a data.table.
The names of the columns and the functions which are to be applied on are given as list mylist which is created by the shiny app.
Among the various approaches my preferred option is to compute on the language, i.e., to create a complete expression from the contents of mylist and to evaluate it:
library(magrittr)
library(data.table)
mylist %>%
names() %>%
lapply(
function(.col) lapply(
mylist[[.col]],
function(.fct) sprintf("%s.%s = %s(%s)", .col, .fct, .fct, .col))) %>%
unlist() %>%
paste(collapse = ", ") %>%
sprintf("as.data.table(mtcars)[, .(%s), by = cyl]", .) %>%
parse(text = .) %>%
eval()
which yields the expected result
cyl disp.sum disp.mean hp.sd drat.sum drat.mean wt.max
1: 6 1283.2 183.3143 24.26049 25.10 3.585714 3.460
2: 4 1156.5 105.1364 20.93453 44.78 4.070909 3.190
3: 8 4943.4 353.1000 50.97689 45.21 3.229286 5.424
The character string which is parsed is created by
mylist %>%
names() %>%
lapply(
function(.col) lapply(
mylist[[.col]],
function(.fct) sprintf("%s.%s = %s(%s)", .col, .fct, .fct, .col))) %>%
unlist() %>%
paste(collapse = ", ") %>%
sprintf("as.data.table(mtcars)[, .(%s), by = cyl]", .)
and looks as if coded manually:
[1] "as.data.table(mtcars)[, .(disp.sum = sum(disp), disp.mean = mean(disp), hp.sd = sd(hp), drat.sum = sum(drat), drat.mean = mean(drat), wt.max = max(wt)), by = cyl]"
Data
For demonstration, mylist is provided "hard-coded":
mylist <- list(
disp = c("sum", "mean"),
hp = "sd",
drat = c("sum", "mean"),
wt = "max")
To turn Uwe's answer into a function I did this:
Summarystats <- function(statlist, dataframe, group) {
statlist %>%
names() %>%
lapply(
function(.col) lapply(
statlist[[.col]],
function(.fct) sprintf("%s.%s = %s(%s)", .col, .fct, .fct, .col))) %>%
unlist() %>%
paste(collapse = ", ") %>%
sprintf("as.data.table(dataframe)[, .(%s), by = group]", .) %>%
parse(text = .) %>%
eval()
}
Now I can call:
Summarystats(mylist, mtcars, 'cyl')
allowing me to call a summary table for whichever dataframe and grouping the user wants in my Shiny App.
I have a csv file with the locations of bicicle stations in four columns: "long", "lat", "nro_est", "nombre". I want to create a geojson fil from this csv file doing this:
as.json(geojson_list(estaciones, lat = 'lat', long = 'long'), pretty = TRUE)
the example from ?geojson_list with data "states" is very similar but instead of creating points it creates polygons:
geojson_list(states[1:351,], geometry="polygon", group='group')
My problem with my adaption brings this error. It takes "nro_est" and "nombre" as coordinates and also it shows "long" as propertie. There is an NA that i dont understand. What am i doing wrong?
"geometry": {
"type": "Point",
"coordinates": [-58.40436, -34.58819, 200, "NA"]
},
"properties": {
"long": "-58.40436",
"nro_est": "200",
"nombre": "Austria y French"
}
You can use library(geojsonsf) to convert a data.frame of lon/lat columns into geojson (with POINT geometries)
In the absence of your data I'm creating a dummy example to show you how it works.
library(geojsonsf)
estaciones <- data.frame(
lat = rnorm(3)
, long = rnorm(3)
, nombre = sample(letters, size = 3)
, nro_est = 1:3
)
geojson <- df_geojson( df = estaciones, lat = "lat", lon = "long" )
jsonify::pretty_json( geojson )
# {
# "type": "FeatureCollection",
# "features": [
# {
# "type": "Feature",
# "properties": {
# "nombre": "q",
# "nro_est": 1
# },
# "geometry": {
# "type": "Point",
# "coordinates": [
# 0.6266271502100352,
# -0.3347400043557775
# ]
# }
# },
# {
# "type": "Feature",
# "properties": {
# "nombre": "a",
# "nro_est": 2
# },
# "geometry": {
# "type": "Point",
# "coordinates": [
# 1.1169599023314834,
# 2.593126207650351
# ]
# }
# },
# {
# "type": "Feature",
# "properties": {
# "nombre": "y",
# "nro_est": 3
# },
# "geometry": {
# "type": "Point",
# "coordinates": [
# 0.18108997460569566,
# 0.4805016218807452
# ]
# }
# }
# ]
# }