How to process many txt files with my code in R

I'm quite a novice, but I've managed to get some code doing what I want for one file at a time.
Now I want to automate the process for 600 files.
I have a rough idea that I need to put the list of files in a vector and then maybe use lapply and a function, but I'm not sure how to do this. The syntax and code are beyond me at the moment.
Here's my code...
# Packages are called
library(tm) #text mining
library(SnowballC) #stemming - reducing words to their root
library(stringr) #for str_trim
library(plyr)
library(dplyr)
library(readtext)
# This is my attempt at running the code on a bunch of text files. Obviously it's unfinished, and I'm not sure if this is the right approach. Where do I put this? Will it even work?
data_files <- list.files(path = "data/", pattern = '*.txt', full.names = T, recursive = T)
lapply(
#
# where do I put this chunk of code?
# do I need to make all the code below a function?
##this bit cleans the document
company <- "CompanyXReport2015"
txt_raw = readLines("data/CompanyXReport2015.txt")
# remove all extra white space, also splits on lines
txt_format1 <- gsub(" *\\b[[:alpha:]]{1,2}\\b *", " ", txt_raw)
txt_format1.5 <- gsub("^ +| +$|( ) +", "\\1", txt_format1)
# recombine now that all white space is stripped
txt_format2 <- str_c(txt_format1.5, collapse=" ")
#split strings on space now to get a list of all words
txt_format3 <- str_split(txt_format2," ")
txt_format3
# convert to vector
txt_format4 <- unlist(txt_format3)
# remove empty strings and those with words shorter than 3 length
txt_format5 <- txt_format4[str_length(txt_format4) > 3]
# combine document back to single string
cleaned <- str_c(txt_format5, collapse=" ")
head(cleaned, 2)
##import key words and run analysis on frequency for the document
s1_raw = readLines("data/stage1r.txt")
str(s1_raw)
s2_raw = readLines("data/stage2r.txt")
str(s2_raw)
s3_raw = readLines("data/stage3r.txt")
str(s3_raw)
s4_raw = readLines("data/stage4r.txt")
str(s4_raw)
s5_raw = readLines("data/stage5r.txt")
str(s5_raw)
# str_count(cleaned, "legal")
# apply str_count function using each stage vector
level1 <- sapply(s1_raw, str_count, string=cleaned)
level2 <- sapply(s2_raw, str_count, string=cleaned)
level3 <- sapply(s3_raw, str_count, string=cleaned)
level4 <- sapply(s4_raw, str_count, string=cleaned)
level5 <- sapply(s5_raw, str_count, string=cleaned)
#make a vector from this for the report later
wordcountresult <- c(level1,level2,level3,level4,level5)
# convert to dataframes
s1 <- as.data.frame(level1)
s2 <- as.data.frame(level2)
s3 <- as.data.frame(level3)
s4 <- as.data.frame(level4)
s5 <- as.data.frame(level5)
# add a count column that each df shares
s1$count <- s1$level1
s2$count <- s2$level2
s3$count <- s3$level3
s4$count <- s4$level4
s5$count <- s5$level5
# add a stage column to identify what stage the word is in
s1$stage <- "Stage 1"
s2$stage <- "Stage 2"
s3$stage <- "Stage 3"
s4$stage <- "Stage 4"
s5$stage <- "Stage 5"
# drop the unique column
s1 <- s1[c("count","stage")]
s2 <- s2[c("count","stage")]
s3 <- s3[c("count","stage")]
s4 <- s4[c("count","stage")]
s5 <- s5[c("count","stage")]
# s1
df <- rbind(s1, s2,s3, s4, s5)
df
#write the summary for each company to a csv
#Making the report
#Make a vector to put in the report
#get stage counts and make a vector
s1c <- sum(s1$count)
s2c <- sum(s2$count)
s3c <- sum(s3$count)
s4c <- sum(s4$count)
s5c <- sum(s5$count)
stagesvec <- c(s1c,s2c,s3c,s4c,s5c)
names(stagesvec) <- c("Stage1","Stage2","Stage3","Stage4","Stage5")
#get the company report name for a vector
companyvec <- c(company)
names(companyvec) <- c("company")
# combine the vectors for the vector row to be inserted into the report
reportresult <- c(companyvec, wordcountresult, stagesvec)
rrdf <- data.frame(t(reportresult))
newdf <- data.frame(t(reportresult))
#if working file exists-use it
if (file.exists("data/WordCount12.csv")){
write.csv(
rrdf,
"data/WordCountTemp12.csv", row.names=FALSE
)
rrdf2 <-
read.csv("data/WordCountTemp12.csv")
df2 <-
read.csv("data/WordCount12.csv")
df2 <- rbind(df2, rrdf2)
write.csv(df2,
"data/WordCount12.csv", row.names=FALSE)
}else{ #if NO working file exists-make it
write.csv(newdf,
"data/WordCount12.csv", row.names=FALSE)
}

Hello :) Here is an example of a workflow; you might find better ones, but it's the one I started with when learning.
listoftextfiles <- list.files(...)
analysis1 <- function(file){ # takes one element of listoftextfiles
# your 1st analysis
}
res1 <- lapply(listoftextfiles, analysis1) # results of the 1st analysis
analysis2 <- function(x){ # takes one element of res1
# your 2nd analysis
}
res2 <- lapply(res1, analysis2) # results of the 2nd analysis
# etc.
You will find many tutorials about writing custom functions on the internet.
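To make that concrete for the code in the question, here is a minimal, untested sketch of wrapping the cleaning-and-counting steps into a function that takes one file path (the helper name count_stages is just for illustration). It assumes the stage vectors s1_raw … s5_raw were read once beforehand with readLines, that data_files has been filtered so it doesn't pick up the stage*.txt keyword files themselves, and it keeps only the stage totals for brevity.
library(stringr)
count_stages <- function(path) {
  txt_raw <- readLines(path)
  # same cleaning steps as in the question
  txt <- gsub(" *\\b[[:alpha:]]{1,2}\\b *", " ", txt_raw)
  txt <- gsub("^ +| +$|( ) +", "\\1", txt)
  words <- unlist(str_split(str_c(txt, collapse = " "), " "))
  cleaned <- str_c(words[str_length(words) > 3], collapse = " ")
  # total keyword hits per stage (str_count is vectorised over patterns)
  stages <- list(s1_raw, s2_raw, s3_raw, s4_raw, s5_raw)
  counts <- vapply(stages, function(s) sum(str_count(cleaned, s)), numeric(1))
  data.frame(company = tools::file_path_sans_ext(basename(path)),
             Stage1 = counts[1], Stage2 = counts[2], Stage3 = counts[3],
             Stage4 = counts[4], Stage5 = counts[5])
}
# one row per file, bound into a single report
results <- do.call(rbind, lapply(data_files, count_stages))
write.csv(results, "data/WordCount12.csv", row.names = FALSE)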

Related

Run a R script for all files in a directory, and store the outputs in one common data frame

I have a script that works fine for one file. It takes the information from a json file, extracts a list and a sublist of it (A), and then another list B with the third element of list A. It creates a data frame with list B and compares it with a master file. Finally, it provides two numbers: the number of elements in the list B and the number of matching elements of that list when comparing with the master file.
However, I have 180 different json files in a folder and I need to run the script for all of them, and build a data frame with the results for each file. So the final result should be something like this (note that the last line's figures are correct, the first two are fictitious):
The code I have so far is the following:
library(rjson)
library(dplyr)
library(tidyverse)
#load data from file
file <- "./raw_data/whf.json"
json_data <- fromJSON(file = file)
org_name <- json_data$id
# extract lists and the sublist
usernames <- json_data$twitter
following <- usernames$following
# create empty vector to populate
longitud = length(following)
names <- vector(length = longitud)
# loop to populate the empty vector with third element of the sub-list
for(i in 1:longitud){
names[i] <- following[[i]][3]
}
# create a data frame and change column name
names_list <- data.frame(sapply(names, c))
colnames(names_list) <- "usernames"
# create a data frame with the correct formatting ready to comparison
org_handles <- data.frame(paste("#", names_list$usernames, sep=""))
colnames(org_handles) <- "Twitter"
# load master file and select the needed columns
psa_handles <- read_csv(file = "./raw_data/psa_handles.csv") %>%
select(Name, AKA, Twitter)
# merge data frames and present the results
org_list <- inner_join(psa_handles, org_handles)
length(org_list$Twitter)
length(usernames$following)
My first attempt is to include this code at the beginning:
files <- list.files()
for(f in files){
json_data <- fromJSON(file = f)
# the rest of the script for one file here
}
but I do not know how to write the code for the data frame or even how to integrate both ideas -the working script and the loop for the file names. I took the idea from here.
The new code after Alvaro Morales' answer is the following
library(rjson)
library(dplyr)
library(tidyverse)
archivos <- list.files("./raw_data/")
calculate_accounts <- function(archivos){
#load data from file
path <- paste("./raw_data/", archivos, sep = "")
json_data <- fromJSON(file = path)
org_name <- json_data$id
# extract lists and the sublist
usernames <- json_data$twitter
following <- usernames$following
# create empty vector to populate
longitud = length(following)
names <- vector(length = longitud)
# loop to populate the empty vector with third element of the sub-list
for(i in 1:longitud){
names[i] <- following[[i]][3]
}
# create a data frame and change column name
names_list <- data.frame(sapply(names, c))
colnames(names_list) <- "usernames"
# create a data frame with the correct formatting ready to comparison
org_handles <- data.frame(paste("#", names_list$usernames, sep=""))
colnames(org_handles) <- "Twitter"
# load master file and select the needed columns
psa_handles <- read_csv(file = "./psa_handles.csv") %>%
select(Name, AKA, Twitter)
# merge data frames and present the results
org_list <- inner_join(psa_handles, org_handles)
accounts_db_org <- length(org_list$Twitter)
accounts_total_org <- length(usernames$following)
}
table_psa <- map_dfr(archivos, calculate_accounts)
However, now there is an error when joining (Joining, by = "Twitter"): it says subscript out of bounds.
Links to 3 test files to put together in raw_data folder:
https://drive.google.com/file/d/1ilUHwLjgtZCzh0LneIJEhTryrGumDF1V/view?usp=sharing
https://drive.google.com/file/d/1KM3hRZ8DzgPMEsMFmwBdmMNHrPCttuaB/view?usp=sharing
https://drive.google.com/file/d/17cWXJ9ltGXZ6izkgJv0uyNwStrE95_OA/view?usp=sharing
Link to the master file to compare:
https://drive.google.com/file/d/11fOpYFFfHijhZl_CuWHKvkrI7edkpUNQ/view?usp=sharing
<<<<< UPDATE >>>>>
I am still trying to find the solution. I did get the code to run and produce output in a valid shape (a 180x3 data frame), but the columns that should hold the values of the objects accounts_db_org and accounts_total_org show NA. When I check the values stored in those objects, they are correct (for the last iteration). So the output now has the right format, but NA instead of the numbers.
I am really close, but I have not been able to make the code show the right numbers. My last attempt is:
library(rjson)
library(dplyr)
library(tidyverse)
archivos <- list.files("./raw_data", pattern = "json", full.names = TRUE)
psa_handles <- read_csv(file = "./raw_data/psa_handles.csv", show_col_types = FALSE) %>%
select(Name, AKA, Twitter)
nr_archivos <- length(archivos)
psa_result <- matrix(nrow = nr_archivos, ncol = 3)
# loop for working with all files, one by one
for(f in 1:nr_archivos){
# load file
json_data <- fromJSON(file = archivos[f])
org_name <- json_data$id
# extract lists and the sublist
usernames <- json_data$twitter
following <- usernames$following
# empty vector
longitud = length(following)
names <- vector(length = longitud)
# loop to populate with the third element of each i item of the sublist
for(i in 1:longitud){
names[i] <- following[[i]][3]
}
# convert the list into a data frame
names_list <- data.frame(sapply(names, c))
colnames(names_list) <- "usernames"
# applying some format prior to comparison
org_handles <- data.frame(paste("#", names_list$usernames, sep=""))
colnames(org_handles) <- "Twitter"
# merge tables and calculate the results for each iteration
org_list <- inner_join(psa_handles, org_handles)
accounts_db_org <- length(org_list$Twitter)
accounts_total_org <- length(usernames$following)
# populate the matrix row by row
psa_result[f] <- c(org_name, accounts_db_org, accounts_total_org)
}
# create a data frame from the matrix and save the result
psa_result <- data.frame(psa_result)
write_csv(psa_result, file = "./outputs/cuentas_seguidas_en_psa.csv")
The subscript out of bounds error was caused by a json file with 0 records; that was fixed by deleting the file.
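A side note on the NA symptom described in the update: psa_result[f] <- c(org_name, accounts_db_org, accounts_total_org) uses single-bracket linear indexing, so each iteration writes only the first element (org_name) into a single cell of the matrix, filling the first column and never reaching the two count columns. Row-wise assignment should be what was intended:
psa_result[f, ] <- c(org_name, accounts_db_org, accounts_total_org)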
You can do it with purrr::map or purrr::map_dfr.
Is this what you're looking for?
archivos <- list.files("./raw_data", pattern = "json", full.names = TRUE)
# load master file and select the needed columns. This needs to be out of "calculate_accounts" because you only read it once.
psa_handles <- read_csv(file = "./raw_data/psa_handles.csv") %>%
select(Name, AKA, Twitter)
# calculate accounts
calculate_accounts <- function(archivo){
json_data <- rjson::fromJSON(file = archivo)
org_handles <- json_data %>%
pluck("twitter", "following") %>%
map_chr("username") %>%
as_tibble() %>%
rename(usernames = value) %>%
mutate(Twitter = str_c("#", usernames)) %>%
select(Twitter)
org_list <- inner_join(psa_handles, org_handles)
org_list %>%
mutate(accounts_db_org = length(Twitter),
accounts_total_org = nrow(org_handles)) %>%
select(-Twitter)
}
table_psa <- map_dfr(archivos, calculate_accounts)
#output:
# A tibble: 53 x 4
Name AKA accounts_db_org accounts_total_org
<chr> <chr> <int> <int>
1 Association of American Medical Colleges AAMC 20 2924
2 American College of Cardiology ACC 20 2924
3 American Heart Association AHA 20 2924
4 British Association of Dermatologists BAD 20 2924
5 Canadian Psoriasis Network CPN 20 2924
6 Canadian Skin Patient Alliance CSPA 20 2924
7 European Academy of Dermatology and Venereology EADV 20 2924
8 European Society for Dermatological Research ESDR 20 2924
9 US Department of Health and Human Service HHS 20 2924
10 International Alliance of Dermatology Patients Organisations (Global Skin) IADPO 20 2924
# ... with 43 more rows
Unfortunately, the answer provided by Álvaro does not work as expected: the output repeats the same number against different organisation names, making it really difficult to read. Actually, the number 20 is repeated 20 times, the number 11, 11 times, and so on. The information is there, but it is not accessible without further data treatment.
I was doing my own research in the meantime and arrived at the following code. I finally made it work, but the result's class was "matrix" "array", which was really confusing. Fortunately, the last lines transpose the data, unlist the array and convert it into a matrix, which can then be converted into a data frame and manipulated as usual.
Maybe my explanation is not very useful, and since I am a newbie, I am sure the code is far from elegant and optimised. Anyway, please review the code below:
library(purrr)
library(rjson)
library(dplyr)
library(tidyverse)
setwd("~/documentos/varios/proyectos/programacion/R/psa_twitter")
# Load data from files.
archivos <- list.files("./raw_data/json_files",
pattern = ".json",
full.names = TRUE)
psa_handles <- read_csv(file = "./raw_data/psa_handles.csv") %>%
select(Name, AKA, Twitter)
nr_archivos <- length(archivos)
calcula_cuentas <- function(a){
# Extract lists
json_data <- fromJSON(file = a)
org_aka <- json_data$id
org_meta <- json_data$metadata
org_name <- org_meta$company
twitter <- json_data$twitter
following <- twitter$following
# create an empty vector to populate
longitud = length(following)
names <- vector(length = longitud)
# loop to populate the empty vector with third element of the sub-list
for(i in 1:longitud){
names[i] <- following[[i]][3]
}
# create a data frame and change column name
names_list <- data.frame(sapply(names, c))
colnames(names_list) <- "usernames"
# Create a data frame with the correct formatting ready to comparison
org_handles <- data.frame(paste("#",
names_list$usernames,
sep="")
)
colnames(org_handles) <- "Twitter"
# merge tables
org_list <- inner_join(psa_handles, org_handles)
cuentas_db_org <- length(org_list$Twitter)
cuentas_total_org <- length(twitter$following)
results <- data.frame(Name = org_name,
AKA = org_aka,
Cuentas_db = cuentas_db_org,
Total = cuentas_total_org)
results
}
# apply function to list of files and unlist the result
psa <- sapply(archivos, calcula_cuentas)
psa1 <- t(as.data.frame(psa))
psa2 <- matrix(unlist(psa1), ncol = 4) %>%
as.data.frame()
colnames(psa2) <- c("Name", "AKA", "tw_int_outbound", "tw_ext_outbound")
# Save the results.
saveRDS(psa2, file = "rda/psa.RDS")
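Since calcula_cuentas already returns a one-row data frame, the transpose/unlist gymnastics can most likely be avoided by row-binding the per-file results directly:
psa2 <- purrr::map_dfr(archivos, calcula_cuentas)
# or in base R:
psa2 <- do.call(rbind, lapply(archivos, calcula_cuentas))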

How can I delete a row containing a specific string in R?

I am new to using R. I am using a data set in which the missing values were replaced with "?" before I received the data. I am looking for a way to delete the rows that contain this. It isn't specific to just one row; it occurs throughout the data.
I have tried Delete rows containing specific strings in R but it isn't working for me. I have included my code so far below.
library(randomForest)
heart <- read.csv(url('http://archive.ics.uci.edu/ml/machine-learning-databases/echocardiogram/echocardiogram.data'))
names <- names(heart)
nrow(heart)
ncol(heart)
names(heart)
colnames(heart)[colnames(heart)=="X11"] <- "survival"
colnames(heart)[colnames(heart)=="X0"] <- "alive"
colnames(heart)[colnames(heart)=="X71"] <- "attackAge"
colnames(heart)[colnames(heart)=="X0.1"] <- "pericardialEffusion"
colnames(heart)[colnames(heart)=="X0.260"] <- "fractionalShortening"
colnames(heart)[colnames(heart)=="X9"] <- "epss"
colnames(heart)[colnames(heart)=="X4.600"] <- "lvdd"
colnames(heart)[colnames(heart)=="X14"] <- "wallMotionScore"
colnames(heart)[colnames(heart)=="X1"] <- "wallMotionIndex"
colnames(heart)[colnames(heart)=="X1.1"] <- "mult"
colnames(heart)[colnames(heart)=="name"] <- "patientName"
colnames(heart)[colnames(heart)=="X1.2"] <- "group"
colnames(heart)[colnames(heart)=="X0.2"] <- "aliveAfterYear"
names(heart)
library(randomForest)
heart <- read.csv(url('http://archive.ics.uci.edu/ml/machine-learning-databases/echocardiogram/echocardiogram.data'),na.strings = "?")
names <- names(heart)
nrow(heart)
ncol(heart)
names(heart)
colnames(heart)[colnames(heart)=="X11"] <- "survival"
colnames(heart)[colnames(heart)=="X0"] <- "alive"
colnames(heart)[colnames(heart)=="X71"] <- "attackAge"
colnames(heart)[colnames(heart)=="X0.1"] <- "pericardialEffusion"
colnames(heart)[colnames(heart)=="X0.260"] <- "fractionalShortening"
colnames(heart)[colnames(heart)=="X9"] <- "epss"
colnames(heart)[colnames(heart)=="X4.600"] <- "lvdd"
colnames(heart)[colnames(heart)=="X14"] <- "wallMotionScore"
colnames(heart)[colnames(heart)=="X1"] <- "wallMotionIndex"
colnames(heart)[colnames(heart)=="X1.1"] <- "mult"
colnames(heart)[colnames(heart)=="name"] <- "patientName"
colnames(heart)[colnames(heart)=="X1.2"] <- "group"
colnames(heart)[colnames(heart)=="X0.2"] <- "aliveAfterYear"
names(heart)
heart1 <- na.omit(heart)
While importing the file you can specify na.strings = "?", and later, using na.omit, you can remove all the rows containing NA values.
I think this can do what you want.
# Do not forget to set stringsAsFactors = FALSE in read.csv,
# so the cells are compared as character strings rather than factors
heart <- read.csv(url('http://archive.ics.uci.edu/ml/machine-learning-databases/echocardiogram/echocardiogram.data'),stringsAsFactors = F)
# Simpler way to assign column names to the dataframe
colnames(heart) <- c("survival", "alive", "attackAge", "pericardialEffusion",
"fractionalShortening", "epss", "lvdd", "wallMotionScore",
"wallMotionIndex", "mult", "patientName",
"group", "aliveAfterYear")
# You can traverse a dataframe as a matrix using the row and column index
# as coordinates
for(r in 1:nrow(heart)){
for(c in 1:ncol(heart)){
# For this particular cell you do a comparison
# substituting the ? with NA which is the default missing value
# in R
heart[r,c] <- ifelse(heart[r,c]=="?",NA,heart[r,c])
}
}
# omit the NA rows
heart <- na.omit(heart)
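For what it's worth, the double loop can be replaced by a single vectorised substitution, since comparing a data frame against a string yields a logical matrix that can be used for matrix-style indexing:
heart[heart == "?"] <- NA
heart <- na.omit(heart)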
Some libraries support reading csv files and specifying strings to be read as missing values. I use the readr library most often. Then you can just use na.omit and similar functions.
library(readr)
library(dplyr)
heart <- read_csv(
'http://archive.ics.uci.edu/ml/machine-learning-databases/echocardiogram/echocardiogram.data',
na=c("", "?")
)
colnames(heart) <- recode(
colnames(heart),
"X11" = "survival",
"X0" = "alive",
"X71" = "attackAge",
"X0.1" = "pericardialEffusion",
"X0.260" = "fractionalShortening",
"X9" = "epss",
"X4.600" = "lvdd",
"X14" = "wallMotionScore",
"X1" = "wallMotionIndex",
"X1.1" = "mult",
"name" = "patientName",
"X1.2" = "group",
"X0.2" = "aliveAfterYear"
)
heart
heart <- na.omit(heart)
(Also, you can spare some typing with the recode function from the dplyr package, but your solution for renaming the columns works just as well.)

Efficient way to split strings in separate rows (creating an edgelist)

I currently have the following problem. I work with Web-of-Science scientific publication and citation data, which has the following structure: A variable "SR" is a string with the name of a publication, "CR" a variable with a string containing all cited references in the article, separated by a ";".
My task now is to create an edgelist between all publications with the corresponding citations, where every publication and citation combination is in a single row. I do it currently with the following code:
# Some minimal data for example
pub <- c("pub1", "pub2", "pub3")
cit <- c("cit1;cit2;cit3;cit4","cit1;cit4;cit5","cit5;cit1")
M <- cbind(pub,cit)
colnames(M) <- c("SR","CR")
# Create an edgelist
cit_el <- data.frame() #
for (i in seq(1, nrow(M), 1)) {
cit <- data.frame(strsplit(as.character(M[i,"CR"]), ";", fixed=T), stringsAsFactors=F)
colnames(cit)[1] <- c("SR")
cit$SR_source <- M[i,"SR"]
cit <- unique(cit)
cit_el <- rbind(cit_el, cit)
}
However, for large datasets of 10k+ publications (which tend to have 50+ citations each), the script runs for 15+ minutes. I know that loops are usually an inefficient way of coding in R, yet I haven't found an alternative that produces what I want.
Anyone knows some trick to make this faster?
This is my attempt. I haven't compared the speeds of different approaches yet.
First, the artificial data: 10k pubs, 100k possible citations, and a maximum of 80 citations per pub.
library(data.table)
library(stringr)
pubCount = 10000
citCount = 100000
maxCitPerPub = 80
pubList <- paste0("pub", seq(pubCount))
citList <- paste0("cit", seq(citCount))
cit <- sapply(sample(seq(maxCitPerPub), pubCount, replace = TRUE),
function(x) str_c(sample(citList, x), collapse = ";"))
data <- data.table(pub = pubList,
cit = cit)
For processing, I use stringr::str_split_fixed to split the citations into columns and use data.table::melt to collapse the columns.
temp <- data.table(pub = pubList, str_split_fixed(data$cit, ";", maxCitPerPub))
result <- melt(temp, id.vars = "pub")[, variable:= NULL][value!='']
Not sure if this is any quicker, but if I'm understanding correctly, this should give the desired result:
rbindlist(lapply(1:nrow(M), function(i){
data.frame(SR_source = M[i, 'SR'], SR = strsplit(M[i, 'CR'], ';'))
}))
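For reference, a fully vectorised base-R version of the same unnesting, which should scale much better than growing cit_el with rbind inside a loop: lengths() gives the number of citations per publication and rep() expands the sources to match.
cit_list <- strsplit(M[, "CR"], ";", fixed = TRUE)
cit_el <- unique(data.frame(
  SR = unlist(cit_list),
  SR_source = rep(M[, "SR"], lengths(cit_list)),
  stringsAsFactors = FALSE
))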

Removing rows based on character conditions in a column

Good morning, I have created the following R code:
setwd("xxx")
library(reshape)
##Insert needed year
url <- "./Quarterly/1990_qtrly.csv"
##Writes data in R with applicable columns
qtrly_data <- read.csv(url, header = TRUE, sep = ",", quote="\"", dec=".", na.strings=" ", skip=0)
relevant_cols <- c("area_fips", "industry_code", "own_code", "agglvl_code", "year", "qtr")
overall <- c(relevant_cols, colnames(qtrly_data)[8:16])
lq <- c(relevant_cols, colnames(qtrly_data)[17:25])
oty <- c(relevant_cols, colnames(qtrly_data)[18:42])
types <- c("overall", "lq", "oty")
overallx <- colnames(qtrly_data)[9:16]
lqx <- colnames(qtrly_data)[18:25]
otyx <- colnames(qtrly_data)[seq(27,42,2)]
###Adding in the disclosure codes from each section
disc_codes <- c("disclosure_code", "lq_disclosure_code", "oty_disclosure_code")
cols_list = list(overall, lq, oty)
denom_list = list(overallx, lqx, otyx)
##Uses a two-loop piece of code to go through data denominations and categories, while melting it into the correct format
for (j in 1:length(types))
{
cat("Working on type: " , types[j], "\n")
these_denominations <- denom_list[[j]]
type_data <- qtrly_data[ , cols_list[[j]] ]
QCEW_County <- melt(type_data, id=c(relevant_cols, disc_codes[j]))
colnames(QCEW_County) <- c(relevant_cols, "disclosure_code", "text_denomination", "value")
Data_Cat <- j
for (k in 1:length(these_denominations))
{
cat("Working on type: " , types[j], "and denomination: ", these_denominations[k], "\n")
QCEW_County_Denominated <- QCEW_County[QCEW_County[, "text_denomination"] == these_denominations[k], ]
QCEW_County_Denominated$disclosure_code <- ifelse(QCEW_County_Denominated$disclosure_code == "", 0, 1)
Data_Denom <- k
QCEW_County_Denominated <- cbind(QCEW_County_Denominated, Data_Cat, Data_Denom)
QCEW_County_Denominated$Source_ID <- 1
QCEW_County_Denominated$text_denomination <- NULL
colnames(QCEW_County_Denominated) <- NULL
###Actually writes the txt file to the QCEW folder
write.table(QCEW_County_Denominated, file="C:\\Users\\jjackson\\Downloads\\QCEW\\1990_test.txt", append=TRUE, quote=FALSE, sep=',', row.names=FALSE)
}
}
Now, there are some things I need to get rid of: namely, all the rows in my QCEW_County_Denominated data frame where the "area_fips" column begins with the character "C". In that same column, there are also codes that start with "US" which I would like to replace with a 0. Finally, the "industry_code" column in my final data frame has three values that need to be replaced: 31-33 with 31, 44-45 with 44, and 48-49 with 48. I understand that this is a difficult task. I'm slowly figuring it out on my own, but if anyone could give me a helpful nudge in the right direction in the meantime, it would be much appreciated. Conditional statements in R are looking like my Achilles heel, as they're always where I begin to get confused about how the syntax differs from other statistical packages.
Thank you, and have a nice day.
You can remove and recode your data using regex and subsetting.
Using grepl, you can select the rows in the column area_fips that DON'T start with C.
QCEW_County_Denominated <- QCEW_County_Denominated[!grepl("^C", QCEW_County_Denominated$area_fips), ]
Using gsub, you can replace with 0 the values in the area_fips column that start with "US".
QCEW_County_Denominated$area_fips <- as.numeric(gsub("^US", 0, QCEW_County_Denominated$area_fips))
Finally, using subsetting you can replace the values in the industry_code.
QCEW_County_Denominated$industry_code[QCEW_County_Denominated$industry_code == "31-33"] <- 31
QCEW_County_Denominated$industry_code[QCEW_County_Denominated$industry_code == "44-45"] <- 44
QCEW_County_Denominated$industry_code[QCEW_County_Denominated$industry_code == "48-49"] <- 48
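If you prefer, the three subsetting assignments can be collapsed into one recode call from dplyr (a sketch, assuming industry_code is a character column):
library(dplyr)
QCEW_County_Denominated$industry_code <- recode(
  QCEW_County_Denominated$industry_code,
  "31-33" = "31", "44-45" = "44", "48-49" = "48"
)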

Replace loop with apply function

I'm using a for loop to create a document-term matrix. My actual problem uses an obscure package called RMeCab to tokenize Japanese text, but here is a more standard equivalent using strsplit. My current code:
Documents <- data.frame(Names= c("A","B"),Texts=c("A string of words","A different string"), stringsAsFactors = FALSE)
OUTPUT <- NULL
COMBINED <- NULL
i <- 1
for (i in 1:length(Documents$Texts)){
OUTPUT <- data.frame(unlist(strsplit(Documents$Texts, " ")))
OUTPUT$doc <- Documents$Names[i]
COMBINED <- rbind(COMBINED, OUTPUT)
}
Document_Term_Matrix <- as.data.frame.matrix(table(COMBINED))
It works, but I'd like to use a more efficient apply function. If I run
L_OUTPUT <- lapply(Documents[,2],function(x) strsplit(x, " "))
I get the separate words as elements of a list, but how do I append the document name from Documents$Names?
More specifically with a list structure:
[[1]]
これ です は ぺん
1 1 1 1
[[2]]
です は 人 彼
1 1 1 1
How do I get a data frame with a column like this:
これ は ぺん です 彼 は 人 です
and a second column showing the document names:
One One One One Two Two Two Two
with the words corresponding to the list elements [[1]], [[2]], etc.?
It is better to use packages such as tm for these kinds of operations, but here is a solution using base R:
list1 <- strsplit(Documents$Texts, ' ')
v1 <- unique(unlist(list1))
Document_Term_Matrix <- as.data.frame(t(sapply(v1, function(i) lapply(list1, function(j)
sum(grepl(i, j))))))
names(Document_Term_Matrix)<- Documents$Names
Document_Term_Matrix
# A B
#A 1 1
#string 1 1
#of 1 0
#words 1 0
#different 0 1
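One caveat: grepl(i, j) does substring matching, so a term like "string" would also count "strings". An exact and fully vectorised way to build the same document-term matrix is table(), with rep() supplying the document name for each token:
list1 <- strsplit(Documents$Texts, " ")
dtm <- table(
  term = unlist(list1),
  doc = rep(Documents$Names, lengths(list1))
)
as.data.frame.matrix(dtm)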
You can use functions from the tm package, which are suitable for large text datasets:
library(tm)
# create corpora from your documents
corp = VCorpus(DataframeSource(Documents), readerControl = list(reader = readTabular(mapping = list(content = "Texts", id = "Names"))))
# create term document matrix
tdm = TermDocumentMatrix(corp, control = list(tokenize = function(x) unlist(strsplit(as.character(x), "[[:space:]]+"))
, stopwords = FALSE
, tolower = TRUE
, weighting = weightTf))
inspect(tdm)
# get the result as matrix
tdm.m = matrix(tdm, nrow = tdm$nrow, ncol = tdm$ncol)
rownames(tdm.m) = tdm$dimnames$Terms
colnames(tdm.m) = tdm$dimnames$Docs
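Note that readTabular() was removed in newer releases of tm (0.7 and later). There, DataframeSource() itself expects a data frame whose first two columns are doc_id and text, so the corpus would instead be built as:
docs <- data.frame(doc_id = Documents$Names, text = Documents$Texts,
                   stringsAsFactors = FALSE)
corp <- VCorpus(DataframeSource(docs))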
I also think there is a mistake in your question (but I cannot add comments).
You're missing [i] in your for loop, so you get the total number of terms across all the documents. It should be something like this:
OUTPUT <- data.frame(unlist(strsplit(Documents$Texts[i], " ")))
