I have a series of steps that I will like to convert into a function, so I can apply it to data frames simply by calling them. Below is the code with some comments:
library("textreadr")
library("pdftools")
library("tidyverse")
library("tidytext")
library("textreadr")
library("tm")
# Create Data frame
Off_let_data <- data.frame(page_id = c(3,3,3,3,3), element_id = c(19, 22, 26, 31, 31),
text = c("The Protected Percentage of your property value thats has been chosen is 0%",
"The Arrangement Fee payable at complettion: £50.00",
"The Fixed Interest Rate that is applied for the life of the period is: 5.40%",
"The Benchmark rate that will be used to calculate any early repayment 2.08%",
"The property value used in this scenario is 275,000.00"))
# read in the first element of a list of pdf file from a folder
files <- list.files(pattern = "pdf$")[1]
# extract the account number from the first pdf file
acc_num <- str_extract(files, "^\\d+")
# The RegEx's used to extract the relevant information
protec_per_reg <- "Protected\\sP\\w+\\sof"
Arr_Fee_reg <- "^The\\sArrangement\\sF\\w+"
Fix_inter_reg <- "Fixed\\sI\\w+\\sR\\w+"
Bench_rate_reg <- "Benchmark\\sR\\w+\\sthat"
# create a df that only includes the rows which match the above RegEx
Off_let <- Off_let_data %>% filter(page_id == 3, str_detect(Off_let_data$text, protec_per_reg)|
str_detect(Off_let_data$text, Arr_Fee_reg) | str_detect(Off_let_data$text, Fix_inter_reg) |
str_detect(Off_let_data$text, Bench_rate_reg))
# Now only extract the numbers from the above DF
off_let_num <- str_extract(Off_let$text, "\\d+\\.?\\d+")
# The first element is always a NA value - based on the structure of these PDF files
# replace the first element of this character vector with the below
off_let_num[is.na(off_let_num)] <- str_extract(Off_let$text, "\\d+%")[[1]]
off_let_num
Can someone please help me turning this into a function. Thanks
Something like this?
What should be the inputs/outputs of the function? For now, the function only takes a data.frame as only argument, but you could extend it, so you can pass different regular expressions, or define the page_id for example.
library("textreadr")
library("pdftools")
library("tidyverse")
library("tidytext")
library("textreadr")
library("tm")
# Create Data frame
Off_let_data <- data.frame(page_id = c(3,3,3,3,3), element_id = c(19, 22, 26, 31, 31),
text = c("The Protected Percentage of your property value thats has been chosen is 0%",
"The Arrangement Fee payable at complettion: £50.00",
"The Fixed Interest Rate that is applied for the life of the period is: 5.40%",
"The Benchmark rate that will be used to calculate any early repayment 2.08%",
"The property value used in this scenario is 275,000.00"))
dummyFunc <- function(df) {
# read in the first element of a list of pdf file from a folder
files <- list.files(pattern = "pdf$")[1]
# extract the account number from the first pdf file
acc_num <- str_extract(files, "^\\d+")
# The RegEx's used to extract the relevant information
protec_per_reg <- "Protected\\sP\\w+\\sof"
Arr_Fee_reg <- "^The\\sArrangement\\sF\\w+"
Fix_inter_reg <- "Fixed\\sI\\w+\\sR\\w+"
Bench_rate_reg <- "Benchmark\\sR\\w+\\sthat"
# create a df that only includes the rows which match the above RegEx
Off_let <- df %>% filter(page_id == 3, str_detect(df$text, protec_per_reg)|
str_detect(df$text, Arr_Fee_reg) | str_detect(df$text, Fix_inter_reg) |
str_detect(df$text, Bench_rate_reg))
# Now only extract the numbers from the above DF
off_let_num <- str_extract(Off_let$text, "\\d+\\.?\\d+")
# The first element is always a NA value - based on the structure of these PDF files
# replace the first element of this character vector with the below
off_let_num[is.na(off_let_num)] <- str_extract(Off_let$text, "\\d+%")[[1]]
return(off_let_num)
}
dummyFunc(Off_let_data)
And for a more extended version of the function:
# The RegEx's used to extract the relevant information
protec_per_reg <- "Protected\\sP\\w+\\sof"
Arr_Fee_reg <- "^The\\sArrangement\\sF\\w+"
Fix_inter_reg <- "Fixed\\sI\\w+\\sR\\w+"
Bench_rate_reg <- "Benchmark\\sR\\w+\\sthat"
regexprlist <- list(protec_per_reg, Arr_Fee_reg,
Fix_inter_reg, Bench_rate_reg)
dummyFuncExt <- function(df, regexp, page_id) {
# read in the first element of a list of pdf file from a folder
files <- list.files(pattern = "pdf$")[1]
# extract the account number from the first pdf file
acc_num <- str_extract(files, "^\\d+")
# create a df that only includes the rows which match the above RegEx
Off_let <- df %>% filter(page_id == page_id, str_detect(df$text, regexprlist[[1]])|
str_detect(df$text, regexprlist[[2]]) | str_detect(df$text, regexprlist[[3]]) |
str_detect(df$text, regexprlist[[4]]))
# Now only extract the numbers from the above DF
off_let_num <- str_extract(Off_let$text, "\\d+\\.?\\d+")
# The first element is always a NA value - based on the structure of these PDF files
# replace the first element of this character vector with the below
off_let_num[is.na(off_let_num)] <- str_extract(Off_let$text, "\\d+%")[[1]]
return(off_let_num)
}
dummyFuncExt(df = Off_let_data, regexp = regexprlist, page_id = 3)
Related
I am trying to find a more efficient way to import a list of data files with a kind of awkward structure. The files are generated by a software program that looks like it was intended to be printed and viewed rather than exported and used. The file contains a list of "Compounds" and then some associated data. Following a line reading "Compound X: XXXX", there are a lines of tab delimited data. Within each file the number of rows for each compound remains constant, but the number of rows may change with different files.
Here is some example data:
#Generate two data files to be imported
cat("Quantify Compound Summary Report\n",
"\nPrinted Mon March 28 14:54:39 2022\n",
"\nCompound 1: One\n",
"\tName\tID\tResult",
"\n1\tA1234\tQC\t25.2",
"\n2\tA4567\tQC\t26.8\n",
"\nCompound 2: Two\n",
"\tName\tID\tResult",
"\n1\tA1234\tQC\t51.1",
"\n2\tA4567\tQC\t48.6\n",
file = "test1.txt")
cat("Quantify Compound Summary Report\n",
"\nPrinted Mon March 28 14:54:39 2022\n",
"\nCompound 1: One\n",
"\tName\tID\tResult",
"\n1\tC1234\tQC\t25.2",
"\n2\tC4567\tQC\t26.8",
"\n3\tC8910\tQC\t25.4\n",
"\nCompound 2: Two\n",
"\tName\tID\tResult",
"\n1\tC1234\tQC\t51.1",
"\n2\tC4567\tQC\t48.6",
"\n3\tC8910\tQC\t45.6\n",
file = "test2.txt")
What I want in the end is a list of data frames, one for each "Compound", containing all rows of data associated with each compound. To get there, I have a fairly convoluted approach of smashed together functions which give me what I want but in a very unruly fashion.
library(tidyverse)
## Step 1: ID list of data files
data.files <- list.files(path = ".",
pattern = ".txt",
full.names = TRUE)
## Step 2: Read in the data files
data.list.raw <- lapply(data.files, read_lines, skip = 4)
## Step 3: Identify the "compounds" in the data file output
Hdr.dat <- lapply(data.list.raw, function(x) grepl("Compound", x)) # Scan the file and find the different compounds within it (this can be applied to any Waters output)
grp.dat <- Map(function(x, y) {x[y][cumsum(y)]}, data.list.raw, Hdr.dat)
## Step 4: Unpack the tab delimited parts of the export file, then generate a list of dataframes within a list of imported files
Read <- function(x) read.table(text = x, sep = "\t", fill = TRUE, stringsAsFactors = FALSE)
raw.dat <- Map(function(x,y) {Map(Read, split(x, y))}, data.list.raw, grp.dat)
## Step 5: Curate the list of compounds - remove "Compound X: "
cmpd.list <- lapply(raw.dat, function(x) trimws(substring(names(x), 13)))
## Step 6: Rename the headers for the dataframes, remove the blank rows and recentre
NameCols <- function(z) lapply(names(z), function(i){
x <- z[[ i ]]
colnames(x) <- x[2,]
x[c(-1,-2),]
})
data.list <- Map(function(x,y){setNames(NameCols(x), y)}, raw.dat, cmpd.list)
## Step 7: rbind the data based on the compound
cmpd_names <- unique(unlist(sapply(data.list, names)))
result <- list()
j <- for (n in cmpd_names) {
result[[n]] <- map(data.list, n)
}
list.merged <- map(result, dplyr::bind_rows)
list.merged <- lapply(list.merged, function(x) x %>% filter(Name != ""))
The challenge here is script efficiency as far as time (I can import hundreds or thousands of data files with hundreds of lines of data, which can take quite a while) as well as general "cleanliness", which is why I included tidyverse as a tag here. I also want this to be highly generalizable, as the "Compounds" may change over time. If someone can come up with a clean and efficient way to do all of this I would be forever in your debt.
See one approach below. The whole pipeline might be intimidating at first glance. You can insert a head (or tail) call after each step (%>%) to display the current stage of data transformation. There's a bit of cleanup with regular expressions going on in the gsubs: modify as desired.
intermediate_result <-
data.frame(file_name = c('test1.txt','test2.txt')) %>%
rowwise %>%
## read file content into a raw string:
mutate(raw = read_file(file_name)) %>%
## separate raw file contents into rows
## using newline and carriage return as row delimiters:
separate_rows(raw, sep = '[\\n\\r]') %>%
## provide a compound column for later grouping
## by extracting the 'Compound' string from column raw
## or setting the compound column to NA otherwise:
mutate(compound = ifelse(grepl('^Compound',raw),
gsub('.*(Compound .*):.*','\\1', raw),
NA)
) %>%
## remove rows with empty raw text:
filter(raw != '') %>%
## filling missing compound values (NAs) with last non-NA compound string:
fill(compound, .direction = 'down') %>%
## keep only rows with tab-separated raw string
## indicating tabular data
filter(grepl('\\t',raw)) %>%
## insert a column header 'Index' because
## original format has four data columns but only three header cols:
mutate(raw = gsub(' *\\tName','Index\tName',raw))
Above steps result in a dataframe with a column 'raw' containing the cleaned-up data as string suited for conversion into tabular data (tab-delimited, linefeeds).
From there on, we can either proceed by keeping and householding the future single tables inside the parent table as a so-called list column (Variant A) or proceed with splitting column 'raw' and mapping it (Variant B, credits to #Dorton).
Variant A produces a column of dataframes inside the dataframe:
intermediate_result %>%
group_by(compound) %>%
## the nifty piece: you can store dataframes inside a dataframe:
mutate(
tables = list(read.table(text = raw, header = TRUE, sep = '\t' ))
)
Variant B produces a list of dataframes named with the corresponding compound:
intermediate_result %>%
split(f = as.factor(.$compound)) %>%
lapply(function(x) x %>%
separate(raw,
into = unlist(
str_split(x$raw[1], pattern = "\t"))
)
)
I have a script that works fine for one file. It takes the information from a json file, extracts a list and a sublist of it (A), and then another list B with the third element of list A. It creates a data frame with list B and compares it with a master file. Finally, it provides two numbers: the number of elements in the list B and the number of matching elements of that list when comparing with the master file.
However, I have 180 different json files in a folder and I need to run the script for all of them, and build a data frame with the results for each file. So the final result should be something like this (note that the last line's figures are correct, the first two are fictitious):
The code I have so far is the following:
library(rjson)
library(dplyr)
library(tidyverse)
#load data from file
file <- "./raw_data/whf.json"
json_data <- fromJSON(file = file)
org_name <- json_data$id
# extract lists and the sublist
usernames <- json_data$twitter
following <- usernames$following
# create empty vector to populate
longitud = length(following)
names <- vector(length = longitud)
# loop to populate the empty vector with third element of the sub-list
for(i in 1:longitud){
names[i] <- following[[i]][3]
}
# create a data frame and change column name
names_list <- data.frame(sapply(names, c))
colnames(names_list) <- "usernames"
# create a data frame with the correct formatting ready to comparison
org_handles <- data.frame(paste("#", names_list$usernames, sep=""))
colnames(org_handles) <- "Twitter"
# load master file and select the needed columns
psa_handles <- read_csv(file = "./raw_data/psa_handles.csv") %>%
select(Name, AKA, Twitter)
# merge data frames and present the results
org_list <- inner_join(psa_handles, org_handles)
length(org_list$Twitter)
length(usernames$following)
My first attempt is to include this code at the beginning:
files <- list.files()
for(f in files){
json_data <- fromJSON(file = f)
# the rest of the script for one file here
}
but I do not know how to write the code for the data frame or even how to integrate both ideas -the working script and the loop for the file names. I took the idea from here.
The new code after Alvaro Morales' answer is the following
library(rjson)
library(dplyr)
library(tidyverse)
archivos <- list.files("./raw_data/")
calculate_accounts <- function(archivos){
#load data from file
path <- paste("./raw_data/", archivos, sep = "")
json_data <- fromJSON(file = path)
org_name <- json_data$id
# extract lists and the sublist
usernames <- json_data$twitter
following <- usernames$following
# create empty vector to populate
longitud = length(following)
names <- vector(length = longitud)
# loop to populate the empty vector with third element of the sub-list
for(i in 1:longitud){
names[i] <- following[[i]][3]
}
# create a data frame and change column name
names_list <- data.frame(sapply(names, c))
colnames(names_list) <- "usernames"
# create a data frame with the correct formatting ready to comparison
org_handles <- data.frame(paste("#", names_list$usernames, sep=""))
colnames(org_handles) <- "Twitter"
# load master file and select the needed columns
psa_handles <- read_csv(file = "./psa_handles.csv") %>%
select(Name, AKA, Twitter)
# merge data frames and present the results
org_list <- inner_join(psa_handles, org_handles)
accounts_db_org <- length(org_list$Twitter)
accounts_total_org <- length(usernames$following)
}
table_psa <- map_dfr(archivos, calculate_accounts)
However, now there is an error when Joining, by = "Twitter", it says subindex out of limits.
Links to 3 test files to put together in raw_data folder:
https://drive.google.com/file/d/1ilUHwLjgtZCzh0LneIJEhTryrGumDF1V/view?usp=sharing
https://drive.google.com/file/d/1KM3hRZ8DzgPMEsMFmwBdmMNHrPCttuaB/view?usp=sharing
https://drive.google.com/file/d/17cWXJ9ltGXZ6izkgJv0uyNwStrE95_OA/view?usp=sharing
Link to the master file to compare:
https://drive.google.com/file/d/11fOpYFFfHijhZl_CuWHKvkrI7edkpUNQ/view?usp=sharing
<<<<< UPDATE >>>>>>
I am trying to find the solution and I did the code work and provide a valide output (a 180x3 data frame), but the columns that should be filled with the values of the objects accounts_db_org and accounts_total_org are showing NA. When checking the value stored in those objects, the values are correct (for the last iteration). So the output now is in its right format, but with NA instead of numbers.
I am really close, but I am not being able to make the code to show the right numbers. My last attempt is:
library(rjson)
library(dplyr)
library(tidyverse)
archivos <- list.files("./raw_data", pattern = "json", full.names = TRUE)
psa_handles <- read_csv(file = "./raw_data/psa_handles.csv", show_col_types = FALSE) %>%
select(Name, AKA, Twitter)
nr_archivos <- length(archivos)
psa_result <- matrix(nrow = nr_archivos, ncol = 3)
# loop for working with all files, one by one
for(f in 1:nr_archivos){
# load file
json_data <- fromJSON(file = archivos[f])
org_name <- json_data$id
# extract lists and the sublist
usernames <- json_data$twitter
following <- usernames$following
# empty vector
longitud = length(following)
names <- vector(length = longitud)
# loop to populate with the third element of each i item of the sublist
for(i in 1:longitud){
names[i] <- following[[i]][3]
}
# convert the list into a data frame
names_list <- data.frame(sapply(names, c))
colnames(names_list) <- "usernames"
# applying some format prior to comparison
org_handles <- data.frame(paste("#", names_list$usernames, sep=""))
colnames(org_handles) <- "Twitter"
# merge tables and calculate the results for each iteration
org_list <- inner_join(psa_handles, org_handles)
accounts_db_org <- length(org_list$Twitter)
accounts_total_org <- length(usernames$following)
# populate the matrix row by row
psa_result[f] <- c(org_name, accounts_db_org, accounts_total_org)
}
# create a data frame from the matrix and save the result
psa_result <- data.frame(psa_result)
write_csv(psa_result, file = "./outputs/cuentas_seguidas_en_psa.csv")
The subscript out of bounds error was caused by a json file with 0 records. That was fixed deleting the file.
You can do it with purrr::map or purrr::map_dfr.
Is this what you looking for?
archivos <- list.files("./raw_data", pattern = "json", full.names = TRUE)
# load master file and select the needed columns. This needs to be out of "calculate_accounts" because you only read it once.
psa_handles <- read_csv(file = "./raw_data/psa_handles.csv") %>%
select(Name, AKA, Twitter)
# calculate accounts
calculate_accounts <- function(archivo){
json_data <- rjson::fromJSON(file = archivo)
org_handles <- json_data %>%
pluck("twitter", "following") %>%
map_chr("username") %>%
as_tibble() %>%
rename(usernames = value) %>%
mutate(Twitter = str_c("#", usernames)) %>%
select(Twitter)
org_list <- inner_join(psa_handles, org_handles)
org_list %>%
mutate(accounts_db_org = length(Twitter),
accounts_total_org = nrow(org_handles)) %>%
select(-Twitter)
}
table_psa <- map_dfr(archivos, calculate_accounts)
#output:
# A tibble: 53 x 4
Name AKA accounts_db_org accounts_total_org
<chr> <chr> <int> <int>
1 Association of American Medical Colleges AAMC 20 2924
2 American College of Cardiology ACC 20 2924
3 American Heart Association AHA 20 2924
4 British Association of Dermatologists BAD 20 2924
5 Canadian Psoriasis Network CPN 20 2924
6 Canadian Skin Patient Alliance CSPA 20 2924
7 European Academy of Dermatology and Venereology EADV 20 2924
8 European Society for Dermatological Research ESDR 20 2924
9 US Department of Health and Human Service HHS 20 2924
10 International Alliance of Dermatology Patients Organisations (Global Skin) IADPO 20 2924
# ... with 43 more rows
Unfortunately, the answer provided by Álvaro does not work as expected, since the output repeats the same number with different organisation names, making it really difficult to read. Actually, the number 20 is repeated 20 times, the number 11, 11 times, and so on. The information is there, but it is not accessible without further data treatment.
I was doing my own research in the meantime and I got to the following code. Finally I made it to work, but the data format was "matrix" "array", really confusing. Fortunately, I wrote the last lines to transpose the data, unlist the array and convert in a matrix, which is able to be converted in a data frame and manipulated as usual.
Maybe my explanation is not very useful, and since I am a newbie, I am sure the code is far from being elegant and optimised. Anyway, please review the code below:
library(purrr)
library(rjson)
library(dplyr)
library(tidyverse)
setwd("~/documentos/varios/proyectos/programacion/R/psa_twitter")
# Load data from files.
archivos <- list.files("./raw_data/json_files",
pattern = ".json",
full.names = TRUE)
psa_handles <- read_csv(file = "./raw_data/psa_handles.csv") %>%
select(Name, AKA, Twitter)
nr_archivos <- length(archivos)
calcula_cuentas <- function(a){
# Extract lists
json_data <- fromJSON(file = a)
org_aka <- json_data$id
org_meta <- json_data$metadata
org_name <- org_meta$company
twitter <- json_data$twitter
following <- twitter$following
# create an empty vector to populate
longitud = length(following)
names <- vector(length = longitud)
# loop to populate the empty vector with third element of the sub-list
for(i in 1:longitud){
names[i] <- following[[i]][3]
}
# create a data frame and change column name
names_list <- data.frame(sapply(names, c))
colnames(names_list) <- "usernames"
# Create a data frame with the correct formatting ready to comparison
org_handles <- data.frame(paste("#",
names_list$usernames,
sep="")
)
colnames(org_handles) <- "Twitter"
# merge tables
org_list <- inner_join(psa_handles, org_handles)
cuentas_db_org <- length(org_list$Twitter)
cuentas_total_org <- length(twitter$following)
results <- data.frame(Name = org_name,
AKA = org_aka,
Cuentas_db = cuentas_db_org,
Total = cuentas_total_org)
results
}
# apply function to list of files and unlist the result
psa <- sapply(archivos, calcula_cuentas)
psa1 <- t(as.data.frame(psa))
psa2 <- matrix(unlist(psa1), ncol = 4) %>%
as.data.frame()
colnames(psa2) <- c("Name", "AKA", "tw_int_outbound", "tw_ext_outbound")
# Save the results.
saveRDS(psa2, file = "rda/psa.RDS")
I'm really struggling to extract the proper information from several thousands PDF files from NTSB (some Dates and numbers to be specific); these PDFs don't require to be OCRed and each report is almost identical in length and layout information.
I need to extract the date and the time of the accident (first page) and some other information, like Pilot's age or its Flight experience. What I tried does the job for several files but is not working for each file the since code I am using is poorly written.
# an example with a single file
library(pdftools)
library(readr)
# Download the file and read it row by row
file <- 'http://data.ntsb.gov/carol-repgen/api/Aviation/ReportMain/GenerateNewestReport/89789/pdf' # less than 100 kb
destfile <- paste0(getwd(),"/example.pdf")
download.file(file, destfile)
pdf <- pdf_text(destfile)
rows <-scan(textConnection(pdf),
what="character", sep = "\n")
# Extract the date of the accident based on the 'Date & Time' occurrence.
date <-rows[grep(pattern = 'Date & Time', x = rows, ignore.case = T, value = F)]
date <- strsplit(date, " ")
date[[1]][9] #this method is not desirable since the date will not be always in that position
# Pilot age
age <- rows[grep(pattern = 'Age', x = rows, ignore.case = F, value = F)]
age <- strsplit(age, split = ' ')
age <- age[[1]][length(age[[1]])] # again, I'm using the exact position in that list
age <- readr::parse_number(age) #
The main issue I got is when I am trying to extract the date and time of the accident. Is it possible to extract that exact piece of information by avoiding using a list as I did here?
I think the best approach to achieve what you want is to use regex.
In this case I use stringr library. The main idea with regex is to find
the desire string pattern, in this case is the date 'July 29, 2014, 11:15'
Take on count that you'll have to check the date format for each pdf file
library(pdftools)
library(readr)
library(stringr)
# Download the file and read it row by row
file <- 'http://data.ntsb.gov/carol-repgen/api/Aviation/ReportMain/GenerateNewestReport/89789/pdf' # less than 100 kb
destfile <- paste0(getwd(), "/example.pdf")
download.file(file, destfile)
pdf <- pdf_text(destfile)
## New code
# Regex pattern for date 'July 29, 2014, 11:15'
regex_pattern <- "[T|t]ime\\:(.*\\d{2}\\:\\d{2})"
# Getting date from page 1
grouped_matched <- str_match_all(pdf[1], regex_pattern)
# This returns a list with groups. You're interested in group 2
raw_date <- grouped_matched[[1]][2] # First element, second group
# Clean date
date <- trimws(raw_date)
# Using dplyr
library(dplyr)
date <- pdf[1] %>%
str_match_all(regex_pattern) %>%
.[[1]] %>% # First list element
.[2] %>% # Second group
trimws() # Remove extra white spaces
You can make a function to extract the date changing the regex pattern for different files
Regards
I've got a function which I'm trying to apply in a for loop that extracts a dataframe from multiple files and combines them into a single one.
This is how, from what I've read, I thought would be the best way to attack it but I get an empty list returned, when I was hoping for a list of dataframes which could be combined using bind_rows.
This is the code I'm using:
combined_functions <- function(file_name) {
#combines the get_dfm_df and get corp function: get dfm tibble straight from the file name
data_frame_returned<- get_dfm_df(getcorp(file_name))
data_frame_returned
}
list_of_dataframes <- list()
file.list <- dir(pattern ="DOCX$")
for (file in file.list) {
dataframe_of_file <- combined_function(file)
append(list_of_dataframes,dataframe_of_file)
}
bind_rows(list_of_dataframes, .id = "column_label") #https://stackoverflow.com/questions/2851327/convert-a-list-of-data-frames-into-one-data-frame
It creates an empty list, gets a list of the file names which the function combined_function uses to create a data frame out of the file and should, to my understanding, append this dataframe to the list. After all the files in the directory have been matched, bind_rows should combine it into one overall dataframe but it only returns an empty tibble. list_of_dataframes is also empty.
I've tried the solution in this answer but it didn't help:
Append a data frame to a list
https://www.dropbox.com/sh/z8vh50b370gcb1j/AAAcbnfAUOM6-y8uWn4-lUWLa?dl=0
This a link to the raw files I am using in this case, but I think the problem is a general one.
Appendix:
These are the functions combined_functions refer too. They work on the individual cases so I'm confident this is not the cause of the problem but I've included them for completeness anyway.
rm(list = ls())
library(quanteda)
library(quanteda.corpora)
library(readtext)
library(LexisNexisTools)
library(tidyverse)
library(tools)
getcorp<- function(file_name){
#function to take the lexis word document, convert it into quanteda corpus object, returns duplicate df and date from filename in list
LNToutput <- lnt_read(file_name)
duplicates_df <- lnt_similarity(LNToutput = LNToutput,
threshold = 0.99)
duplicates_df <- duplicates_df[duplicates_df$Similarity > 0.99] #https://github.com/JBGruber/LexisNexisTools creates dataframe of duplicate articles
LNToutput <- LNToutput[!LNToutput#meta$ID %in% duplicates_df$ID_duplicate, ] #removes these duplicates from the main dataframe
corp <- lnt_convert(LNToutput, to = "quanteda") #to return multiple values from the r function, must be placed in a list
corp_date_from_file_name <- basename(file_name)
file_date <- as.Date(corp_date_from_file_name, format ="%d_%m_%y")
list_of_returns <-list(duplicates_df, corp,file_date) #list returns has duplicate df in first position, corpus in second and the file date in third
list_of_returns
}
get_dfm_df <- function(corp_list){
# takes the corp from getcorp, applies lexicoder dictionary, adds the neg_pos etc to their equivalent columns,
# calculates the percentage each category is of the total number of sentiment bearing words, adds the date specified from the file name
corpus_we_want <- corp_list[[2]]
sentiment_df <- dfm(corpus_we_want, dictionary = data_dictionary_LSD2015) %>% #applies the dictionary
convert("data.frame") %>%
cbind(docvars(corpus_we_want)) %>% #https://stackoverflow.com/questions/60419692/how-to-convert-dfm-into-dataframe-but-keeping-docvars
as_tibble() %>%
mutate(combined_negative = negative + neg_positive, combined_positive = positive + neg_negative) %>%
mutate(pos_percentage = combined_positive/(combined_positive + combined_negative ), neg_percentage =combined_negative/(combined_positive + combined_negative ) ) %>%
mutate(date = corp_list[[3]])
sentiment_df
}
I want to create variable names on the fly inside a list and assign them values in R, but I am unable to get the desired result. Here is the logic of my code:
Upon the function call: dat_in <- readf(1,2), an input file is read based on a product and site. After reading, a particular column (13th, here) is assigned to a variable aot500. I want to have this variable return from the function for each combination of product and site. For example, I need variables name in the list statement as aot500.AF, aot500.CM, aot500.RB to be returned from this function. I am having trouble in the return statement. There is no error but there is nothing in dat_in. I expect it to have dat_in$aot500.AF etc. Please inform what is wrong in the return statement. Furthermore, I want to read files for all combinations in a single call to the function, say using a for loop and I wonder how would the return statement handle list of more variables.
prod <- c('inv','tot')
site <- c('AF','CM','RB')
readf <- function(pp, kk) {
fname.dsa <- paste("../data/site_data_",prod[pp],"/daily_",site[kk],".dat",sep="")
inp.aod <- read.csv(fname.dsa,skip=4,sep=",",stringsAsFactors=F,na.strings="N/A")
aot500 <- inp.aod[,13]
return(list(assign(paste("aot500",siteabbr[kk],sep="."),aot500)))
}
Almost always there is no need to use assign(), we can solve the problem in two steps, read the files into a list, then give names.
(Not tested as we don't have your files)
prod <- c('inv', 'tot')
site <- c('AF', 'CM', 'RB')
# get combo of site and prod
prod_site <- expand.grid(prod, site)
colnames(prod_site) <- c("prod", "site")
# Step 1: read the files into a list
res <- lapply(1:nrow(prod_site), function(i){
fname.dsa <- paste0("../data/site_data_",
prod_site[i, "prod"],
"/daily_",
prod_site[i, "site"],
".dat")
inp.aod <- read.csv(fname.dsa,
skip = 4,
stringsAsFactors = FALSE,
na.strings = "N/A")
inp.aod[, 13]
})
# Step 2: assign names to a list
names(res) <- paste("aot500", prod_site$prod, prod_site$site, sep = ".")
I propose two answers, one based on dplyr and one based on base R.
You'll probably have to adapt the filename in the readAOT_500 function to your particular case.
Base R answer
#' Function that reads AOT_500 from the given product and site file
#' #param prodsite character vector containing 2 elements
#' name of a product and name of a site
readAOT_500 <- function(prodsite,
selectedcolumn = c("AOT_500"),
path = tempdir()){
cat(path, prodsite)
filename <- paste0(path, prodsite[1],
prodsite[2], ".csv")
dtf <- read.csv(filename, stringsAsFactors = FALSE)
dtf <- dtf[selectedcolumn]
dtf$prod <- prodsite[1]
dtf$site <- prodsite[2]
return(dtf)
}
# Load one file for example
readAOT_500(c("inv", "AF"))
listofsites <- list(c("inv","AF"),
c("tot","AF"),
c("inv", "CM"),
c( "tot", "CM"),
c("inv", "RB"),
c("tot", "RB"))
# Load all files in a list of data frames
prodsitedata <- lapply(listofsites, readAOT_500)
# Combine all data frames together
prodsitedata <- Reduce(rbind,prodsitedata)
dplyr answer
I use Hadley Wickham's packages to clean data.
library(dplyr)
library(tidyr)
daily_CM <- read.csv("~/downloads/daily_CM.dat",skip=4,sep=",",stringsAsFactors=F,na.strings="N/A")
# Generate all combinations of product and site.
prodsite <- expand.grid(prod = c('inv','tot'),
site = c('AF','CM','RB')) %>%
# Group variables to use do() later on
group_by(prod, site)
Create 6 fake files by sampling from the data you provided
You can skip this section when you have real data.
I used various sample length so that the number of observations
differs for each site.
prodsite$samplelength <- sample(1:495,nrow(prodsite))
prodsite %>%
do(stuff = write.csv(sample_n(daily_CM,.$samplelength),
paste0(tempdir(),.$prod,.$site,".csv")))
Read many files using dplyr::do()
prodsitedata <- prodsite %>%
do(read.csv(paste0(tempdir(),.$prod,.$site,".csv"),
stringsAsFactors = FALSE))
# Select only the columns you are interested in
prodsitedata2 <- prodsitedata %>%
select(prod, site, AOT_500)