Combine different tables in a list in R - r

update: Code below seems to work
I'm not entire sure to how this question, so I apologise if this is worded badly. I tried looking for "combine different elements of a list using apply" but that doesn't seem to work.
Anyways, as the result of scraping a website, I have two vectors giving identifying information and a list that contains a number of different tables. A simplified version looks something like this:
respondents <- c("A", "B")
questions <- c("question1", "question2")
df1 <- data.frame(
option = c("yes", "no"),
percentage = c(70, 30), stringsAsFactors = FALSE)
df2 <- data.frame(
option= c("today", "yesterday"),
percentage =c(30, 70), stringsAsFactors = FALSE)
df3 <- data.frame(
option = c("yes", "no"),
percentage = c(60, 40), stringsAsFactors = FALSE)
df4 <- data.frame(
option= c("today", "yesterday"),
percentage =c(20, 80), stringsAsFactors = FALSE)
lst <- list(df1, df2, df3, df4)
The first two tables are questions and responses from the first participant, and the second two tables are questions are from the second participant. What i would like to do is to create two tables that contain the answers to the questions for the two participants. So I would like something that looks like this:
question1 <- data.frame(
option = c("yes", "no"),
A = c(70, 30),
B = c(60, 40), stringsAsFactors = FALSE)
question2 <- data.frame(
option = c("today", "yesterday"),
A = c(30, 70),
B = c(20, 80), stringsAsFactors = FALSE)
In my case, I have 122 responses from 51 participants, and it ordered so that tables 1-122 are from the first participant, the next 122 tables are from the second participant, etc. Ultimately, then, I would like to have 122 tables (one table per question), with each table containing 51 columns that correspond to each participant. I am more or less at a loss as to how to do this, so I would appreciate any suggestions.
This should now work:
library("RCurl")
library("XML")
# Get the data
## Create URL address
mainURL <- 'http://www4.uwm.edu/FLL/linguistics/dialect/staticmaps/'
stateURL <- 'states.html'
url <- paste0(mainURL, stateURL)
## Download URL
tmp <- getURL(url)
## Parse
tmp <- htmlTreeParse(tmp, useInternalNodes = TRUE)
## Extract page addresses and save to subURL
subURL <- unlist(xpathSApply(tmp, '//a[#href]', xmlAttrs))
## Remove pages that aren't state's names
subURL <- subURL[-(1:4)]
## Show first four states
head(subURL, 4)
# Get questions
## Select first state
suburl <- subURL[1]
## Paste it at the end of the main URL
url <- paste0(mainURL, suburl)
## Download URL
tmp <- getURL(url)
## Read data from html
tb <- readHTMLTable(tmp, stringsAsFactors = FALSE)
##Remove empty strings
Questions <- Questions[Questions!= '']
# Create objects to populate later
stateNames <- rep('', length(subURL))
## Populate stateNames
### Remove state_ from stateNames
stateNames <- gsub('state_','',subURL)
### Remove .html from stateNames
stateNames <- gsub('.html','',stateNames)
# Remove pictures in the data representing IPA symbols with their names (e.g., names of the pictures)
## Get url
url <- paste0(mainURL, subURL)
tmp <- getURL(url)
## Replace .gif with _
tmp <- gsub(".gif>", '_', tmp)
## Replace "<img\\s+src=./images/" with _
tmp <- gsub("<img\\s+src=./images/", '_', tmp)
# Read in data
tb <- readHTMLTable(tmp, stringsAsFactors = FALSE)
## Subset 2nd and 4th columns and apply to every item on list
tb <- lapply(tb, function(x) x[,c(2,4)])
## Remove quotation marks, percent sign and convert to number; apply to every item
tb <- lapply(tb, function(x) {
x [,2 ] = gsub('\\(','',x[,2] )
x [,2 ] = gsub('%\\)','',x[,2])
x [,2 ] = as.numeric(x[,2])
x
}
)
## Assign column names to all dataframes
tb <- lapply(tb, setNames , nm = c("option", "percentage"))
#get rid of extra tables
tb1 <- tb[-seq(1, length(tb), by=123)]
## Function to clean data sets
f1 <- function(list1){ Reduce(function(...) merge(..., by= 'option', all=TRUE), list1) }; res <- lapply(1:122, function(i) {indx <- seq(i, length(tb), by=122); f1(tb[indx])})
## Function to merge datasets together
res1 <- lapply(1:122, function(i) f1(tb1[seq(i, length(tb1), by=122)]))
## Create names for the states
stateNames2 <- c("option", stateNames)
# Rename columns in the new dataframes
res2 <- lapply(res1, setNames , nm = stateNames2)
# Test to see whether it works
test <- res2[[122]]

Thanks to akrun (see comments), I got this to work. The full code is here:
library("RCurl")
library("XML")
# Get the data
## Create URL address
mainURL <- 'http://www4.uwm.edu/FLL/linguistics/dialect/staticmaps/'
stateURL <- 'states.html'
url <- paste0(mainURL, stateURL)
url
## Download URL
tmp <- getURL(url)
## Parse
tmp <- htmlTreeParse(tmp, useInternalNodes = TRUE)
## Extract page addresses and save to subURL
subURL <- unlist(xpathSApply(tmp, '//a[#href]', xmlAttrs))
## Remove pages that aren't state's names
subURL <- subURL[-(1:4)]
## Show first four states
head(subURL, 4)
# Get questions
## Select first state
suburl <- subURL[1]
## Paste it at the end of the main URL
url <- paste0(mainURL, suburl)
## Download URL
tmp <- getURL(url)
## Read data from html
tb <- readHTMLTable(tmp, stringsAsFactors = FALSE)
## Remove first column
Questions <- tb[[1]][,1]
##Remove empty strings
Questions <- Questions[Questions!= '']
# Create objects to populate later
survey <- vector(length(subURL), mode = "list")
i <- 1
stateNames <- rep('', length(subURL))
## Populate stateNames
### Remove state_ from stateNames
stateNames <- gsub('state_','',subURL)
### Remove .html from stateNames
stateNames <- gsub('.html','',stateNames)
# Remove pictures in the data representing IPA symbols with their names (e.g., names of the pictures)
## Get url
url <- paste0(mainURL, subURL)
tmp <- getURL(url)
## Replace .gif with _
tmp <- gsub(".gif>", '_', tmp)
## Replace "<img\\s+src=./images/" with _
tmp <- gsub("<img\\s+src=./images/", '_', tmp)
# Read in data
tb <- readHTMLTable(tmp, stringsAsFactors = FALSE)
#tb <- tb[-1]
## Subset 2nd and 4th columns and apply to every item on list
tb <- lapply(tb, function(x) x[,c(2,4)])
## Remove quotation marks, percent sign and convert to number; apply to every item
tb <- lapply(tb, function(x) {
x [,2 ] = gsub('\\(','',x[,2] )
x [,2 ] = gsub('%\\)','',x[,2])
x [,2 ] = as.numeric(x[,2])
x
}
)
## Assign column names to all dataframes
tb <- lapply(tb, setNames , nm = c("option", "percentage"))
## Remove unneeded dataframes in list
tb1 <- tb[-seq(1, length(tb), by=123)]
## Function to clean data sets
f1 <- function(list1){ Reduce(function(...) merge(..., by= 'option', all=TRUE), list1) }; res <- lapply(1:122, function(i) {indx <- seq(i, length(tb), by=122); f1(tb[indx])})
## Function to merge datasets together
res1 <- lapply(1:122, function(i) f1(tb1[seq(i, length(tb1), by=122)]))
## Create names for the states
stateNames2 <- c("Options", stateNames)
# Rename columns in the new dataframes
res2 <- lapply(res1, setNames , nm = stateNames2)
# Test to see whether it works
test <- res2[[1]]

Related

Adding an ID column in a list of dataframes in R

I have a list of XML files that I am merging together, but I am trying to figure out how to add an "id" column to each data frame based on the file name.
# BUILD DATAFRAME LIST
list_filenames <- list.files(pattern = ".xml$")
df_list <- lapply(list_filenames, function(f) {
list_ids <- as.list(list_filenames)
doc <- xmlParse(f, useInternalNodes = TRUE)
doc2 <- xmlToDataFrame(doc, nodes = getNodeSet(doc, "//Event"))
mapply(cbind, doc2, "id" = list_ids, SIMPLIFY = F) # Code that kind of works
})
final_df <- do.call(rbind, df_list)
I'm hoping to get something that looks like this:
ex_df <- cbind(x = c(3, 2, 10, 12),
y = c("a", "b", "c", "d"),
id = c("file_name_1", "file_name_1", "file_name_2", "file_name_2")) %>%
as.data.frame()
> ex_df
x y id
1 3 a file_name_1
2 2 b file_name_1
3 10 c file_name_2
4 12 d file_name_2
We can use Map
nm1 <- sub("\\.xml$", "", list_filenames)
out <- do.call(rbind, Map(cbind, df_list, id = nm1))
In the OP's code, we are looping over the list_filenames and then in the second line, using the full set of list_filenames in
as.list(list_filenames)
Instead it would be just 'f' i.e
df_list <- lapply(list_filenames, function(f) {
list_id <- sub("\\.xml$", "", f) #####
doc <- xmlParse(f, useInternalNodes = TRUE)
doc2 <- xmlToDataFrame(doc, nodes = getNodeSet(doc, "//Event"))
doc2$id = list_id
doc2
})
Then, we could rbind the `list elements
do.call(rbind, df_list)

automation to merge data frames adding a line to keep note of the origin

I am a newbie with R. I have 6 different data frames (U, V, W, X, Y, Z), coming from different CSV files, each of them has the same columns (Surname, Name, Winter, Spring, Summer), and I would like to create a new data frame containing the 5 rows and a sixth row which indicates one of the letters (U, V, ...) where the original data comes from. I have tried with the following code:
U <- read.csv(file = "U", header = T)
V <- read.csv(file = "V", header = T)
W <- read.csv(file = "W", header = T)
X <- read.csv(file = "X", header = T)
Y <- read.csv(file = "Y", header = T)
Z <- read.csv(file = "Z", header = T)
U['class'] <- rep("U")
V['class'] <- rep("V")
W['class'] <- rep("W")
X['class'] <- rep("X")
Y['class'] <- rep("Y")
Z['class'] <- rep("Z")
students <- rbind(U, V, W, X, Y, Z)
I would really need to use a loop, so that I can in future go from A to Z. I would like to do something like this, which is totally nonsense.
for(class.name in list(U, V, W, X, Y, Z)){
class.name['class'] <- rep('class')
}
Is there a reasonable way to do it?
Thank you
Edited
To clarify my question, the idea is that I have 6 different stations collecting raw data and giving me 6 different data frames. I want to merge them together, maintaining the information of from which station the raw data comes from.
Possible incomplete solution
Following #MrFlick's advice, I have managed to put everything in one list as follows
classes <- c('U', 'V', 'W', 'X', 'W', 'Z')
my.files <- paste(classes,".csv",sep="")
year.eight <- lapply(my.files, read.csv, header = T)
name(year.eight) <- classes
However, the final outcome should be one single data frame with a further column to indicate which class are the students in. Can someone help me with this, please?
Let me try to share an example
Suppose we have 3 files A.csv, B.csv and C.csv in a folder called "data" within our working directory. Suppose they contain a single column with a numeric value. Then this code does what you want.
library(readr)
files <- paste0("data/", list.files("data"))
df_list <- list()
for (i in seq_along(files)) {
tmp <- read_csv(files[[i]])
tmp["class"] <- sub("\\..*", "", basename(files[[i]])) # ".csv$" also works in this case
df_list[[i]] <- tmp
}
output <- dplyr::bind_rows(df_list)
output
## A tibble: 3 x 2
# x class
# <dbl> <chr>
# 1 1 A
# 2 1 B
# 3 1 C
Edited following Tensibai's excellent suggestion.
To do this more easily with a list of data.frames, it might look something like this
classes <- c('U', 'V', 'W', 'X', 'W', 'Z')
my.files <- paste(classes,".csv",sep="")
year.eight <- mapply(function(path, code) {
data <- read.csv(path, header = T)
data$class <- code
data
}, my.files, classes)
combined <- do.call("rbind", year.eight)
Or using dplyr
classes <- c('U', 'V', 'W', 'X', 'W', 'Z')
my.files <- paste(classes,".csv",sep="")
year.eight <- lapply(my.files, read.csv, header = T)
names(year.eight) <- classes
combined <- dplyr::bind_rows(year.eight, .id="class")
If you save all the files of interest in a specific directory you can then access them using list.files(). Then loop over this using map_df from purrr package. Think this does the trick
#Load package
library(purrr)
#Define the directory where files are saved
path <- "your_file_path/" #e.g. my Mac desktop "~/Desktop/"
#Create vector of file names
files <- list.files(path)
#Use map_df function from purrr to loop over and return a data frame with extra label variable
map_df(files, function(x){
#save as df
df <- read.csv(paste0(path, "/",x))
#use gsub to remove ".csv" from file name
df['class'] <- gsub("\\.csv", "", x)
df
})

Rbinding large list of dataframes after I did some data cleaning on the list

My problem is, that I can't merge a large list of dataframes before doing some data cleaning. But it seems like my data cleaning is missing from the list.
I have 43 xlsx-files, which I've put in a list.
Here's my code for that part:
file.list <- list.files(recursive=T,pattern='*.xlsx')
dat = lapply(file.list, function(i){
x = read.xlsx(i, sheet=1, startRow=2, colNames = T,
skipEmptyCols = T, skipEmptyRows = T)
# Create column with file name
x$file = i
# Return data
x
})
I then did some datacleaning. Some of the dataframes had some empty columns that weren't skipped in the loading and some columns I just didn't need.
Example of how I removed one column (X1) from all dataframes in the list:
dat <- lapply(dat, function(x) { x["X1"] <- NULL; x })
I also applies column names:
colnames <- c("ID", "UDLIGNNR","BILAGNR", "AKT", "BA",
"IART", "HTRANS", "DTRANS", "BELOB", "REGD",
"BOGFD", "AFVBOGFD", "VALORD", "UDLIGND",
"UÅ", "AFSTEMNGL", "NRBASIS", "SPECIFIK1",
"SPECIFIK2", "SPECIFIK3", "PERIODE","FILE")
dat <- lapply(dat, setNames, colnames)
My problem is, when I open the list or look at the elements in the list, my data cleaning is missing.
And I can't bind the dataframes before the data cleaning since they're aren't looking the same.
What am I doing wrong here?
EDIT: Sample data*
# Sample data
a <- c("a","b","c")
b <- c(1,2,3)
X1 <- c("", "","")
c <- c("a","b","c")
X2 <- c(1,2,3)
X1 <- c("", "","")
df1 <- data.frame(a,b,c,X1)
df2 <- data.frame(a,b,c,X1,X2)
# Putting in list
dat <- list(df1,df2)
# Removing unwanted columns
dat <- lapply(dat, function(x) { x["X1"] <- NULL; x })
dat <- lapply(dat, function(x) { x["X2"] <- NULL; x })
# Setting column names
colnames <- c("Alpha", "Beta", "Gamma")
dat <- lapply(dat, setNames, colnames)
# Merging dataframes
df <- do.call(rbind,dat)
So I've just found that with my sample data this goes smoothly.
I had to reopen the list in View-mode to see the changes I made. That doesn't change the fact that when writing to csv and reopening all the data cleaning is missing (haven'tr tried this with my sample data).
I am wondering if it's because I've changed the merge?
# My merge when I wrote this question:
df <- do.call("rbindlist", dat)
# My merge now:
df <- do.call(rbind,dat)
When I use my real data it doesnøt go as smoothly, so I guess the sample data is bad. I don't know what I'm doing wrong so I can't give some better sample data.
The message I get when merging with rbind:
error in rbind(deparse.level ...) numbers of columns of arguments do not match

In R, how can I copy rows from one dataframe to another when the df being copied to has 2 additional columns?

I have a tab delimited text file with 12 columns that I am uploading to my program. I go on to create another dataframe with a structure similar to the one uploaded and add 2 more columns to it.
excelfile = read.delim(ExcelPath)
matchedPictures<- excelfile[0,]
matchedPictures$beforeName <- character()
matchedPictures$afterName <- character()
Now I have a function in which I do the following:
Based on a condition, I obtain the row number pictureMatchNum of the row I need to copy from excelfile to matchedPictures.
I should then copy the row from excelfile to matchedPictures. I tried a couple of different ways so far.
a.
rowNumber = nrow(matchedPictures) + 1
matchedPictures[rowNumber,1:12] <<- excelfile[pictureMatchNum,1:12]
b.
matchedPictures[rowNumber,1:12] <<- rbind(matchedPictures, excelfile[pictureWordMatches,1:12], make.row.names = FALSE)
2a. doesn't seem to work because it copies the indices from the excelfileand uses them as row names in the matchedPictures - which is why I decided to go with rbind
2b. doesn't seem to work because rbind needs to have the columns be identical and matchedPictureshas 2 extra columns.
EDIT START - Including reproducible example.
Here is some reproducible code (with fewer columns and fake data)
excelfile <- data.frame(x = letters, y = words[length(letters)], z= fruit[length(letters)] )
matchedPictures <- excelfile[0,]
matchedPictures$beforeName <- character()
matchedPictures$afterName <- character()
pictureMatchNum1 = match(1, str_detect("A", regex(excelfile$x, ignore_case = TRUE)))
rowNumber1 = nrow(matchedPictures) + 1
pictureMatchNum2 = match(1, str_detect("D", regex(excelfile$x, ignore_case = TRUE)))
rowNumber2 = nrow(matchedPictures) + 1
The 2 options I tried are
2a.
matchedPictures[rowNumber1,1:3] <<- excelfile[pictureMatchNum1,1:3]
matchedPictures[rowNumber1,"beforeName"] <<- "xxx"
matchedPictures[rowNumber1,"afterName"] <<- "yyy"
matchedPictures[rowNumber2,1:3] <<- excelfile[pictureMatchNum2,1:3]
matchedPictures[rowNumber2,"beforeName"] <<- "uuu"
matchedPictures[rowNumber2,"afterName"] <<- "www"
OR
2b.
matchedPictures[rowNumber1,1:3] <<- rbind(matchedPictures, excelfile[pictureMatchNum1,1:3], make.row.names = FALSE)
matchedPictures[rowNumber1,"beforeName"] <<- "xxx"
matchedPictures[rowNumber1,"afterName"] <<- "yyy"
matchedPictures[rowNumber2,1:3] <<- rbind(matchedPictures, excelfile[pictureMatchNum2,1:3], make.row.names = FALSE)
matchedPictures[rowNumber2,"beforeName"] <<- "uuu"
matchedPictures[rowNumber2,"afterName"] <<- "www"
EDIT END
Additionally, I have also seen the suggestions in many places that rather than using empty dataframes, one should have vectors and append data to the vectors and then combine them into a dataframe. Is this suggestion valid when I have so many columns and would need to have 14 separate vectors and copy each one of them individually?
What can I do to make this work?
You could
first determine the row indices of excelfile that match your criteria
extract these rows
then generate the data to fill your columns beforeName and afterName
then append these columns to your new data frame
Example:
excelfile <- data.frame(x = letters, y = words[length(letters)],
z = fruit[length(letters)])
## Vector of patterns:
patternVec <- c("A", "D", "M")
## Look for appropriate rows in file 'excelfile':
indexVec <- vapply(patternVec,
function(myPattern) which(str_detect(myPattern,
regex(excelfile$x, ignore_case = TRUE))), integer(1))
## Extract these rows:
matchedPictures <- excelfile[indexVec,]
## Somehow generate the data for columns 'beforeName' and 'afterName':
## I do not know how this information is generated so I just insert
## some dummy code here:
beforeNameVec <- c("xxx", "uuu", "mmm")
afterNameVec <- c("yyy", "www", "nnn")
## Then assign these variables:
matchedPictures$beforeName <- beforeNameVec
matchedPictures$afterName <- afterNameVec
matchedPictures
# x y z beforeName afterName
# a air dragonfruit xxx yyy
# d air dragonfruit uuu www
# m air dragonfruit mmm nnn
You can make this much simpler by using dplyr
library(dplyr)
library(stringr)
excelfile <- data.frame(x = letters, y = words[length(letters)], z= fruit[length(letters)],
stringsAsFactors = FALSE ) #add stringsAsFactors to have character columns
pictureMatch <- excelfile %>%
#create a match column
mutate(match = ifelse(str_detect(x,"a") | str_detect(x,'d'),1,0)) %>%
#filter to only the columns that match your condition
filter(match ==1)
pictureMatch <- pictureMatch[['x']] #convert to a vector
matchedPictures <- excelfile %>%
filter(x %in% pictureMatch) %>% #grab the rows that match your condition
mutate(beforeName = c('xxx','uuu'), #add your names
afterName = c('yyy','www'))

R extract data from pdf

Here is the data
http://drdpat.bih.nic.in/Downloads/Rice-Varieties-1996-2012.pdf
It is a pdf. If you open the pdf, you will on page 2, there is a table that I need to extract and store it in a dataframe. I followed this link to do this
https://ropensci.org/blog/2016/03/01/pdftools-and-jeroen
library(pdftools)
text <- pdf_text("data.pdf")
dat<-text[2] # this reads the second page
After this, no matter what I try it does not convert it into a tabular format. I
tried this:
dat1 <- matrix(dat, byrow = TRUE,nrow = 12, ncol = 8) # it didn't work
Tried to use the scan function
dat.s <- scan(dat, what = "character", sep = " ", skip = 2) # no use
Can anyone help me with this? Also I am only looking to achieve this in R
Thanks
The structure of the tables in the PDF is a bit messed up: some columns overlap with each other and the tabulizer algorithm does not extract them correctly.
I was only able to extract the first 6 columns from page 2; the last 2 columns (Salient Features, "Recommended for cultivation") remain problematic...
library(tabulizer)
library(dplyr)
out1 <- extract_tables("Rice-Varieties-1996-2012.pdf", pages=2)[[1]]
## With a moderate amount of hacking,
## the following columns are correctly extracted:
## 1. Sl. No.
## 4. Year of Notification
## 5. Duration (in days)
## 6. Eco-System
sel <- gsub(" ","",out1[ ,c(1,4,5,6)])
## To extract Parentage column, you can use the `area` parameter:
## I figured out the values by trial and error
out2 <- extract_tables("Rice-Varieties-1996-2012.pdf", guess=FALSE,
pages=2,
area=list(c(80,120,2000,420) ) )[[1]]
sel <- cbind(sel,out2[1:nrow(sel),1])
## The header is contained in the first 3 rows of `sel`
## which can be aggregated by `paste0`
print(sel)
head <- aggregate(sel[1:3, ], by=list(rep(1,3)), paste0, collapse="") %>%
select(-Group.1)
## The body is a bit harder, because each record might be split across
## a variable number of rows, depending on the entries.
## I have used non-empty records for column 1 (Sl.No.)
## to identify the breakpoints where to split sel into row blocks
## pertaining to the same record.
body <- sel[-(1:3), ]
brks <- body[ ,1]!=""
ibrk <- c((1:nrow(body))[brks], nrow(body)+1)
ll <- unlist(sapply(1:(length(ibrk)-1), function(k) rep(ibrk[k],ibrk[k+1]-ibrk[k])))
stopifnot(length(ll)==nrow(body))
body <- data.frame(body, stringsAsFactors=FALSE)
colnames(body) <- head
tab <- aggregate(body, by=list(ll), paste0, collapse="") %>%
select(-Group.1)
print(tab)
## Using the same trick as above with brks and ibrk,
## one is able to extract column "Name of variety"
## (again, I found the values of area by trial and error).
out3 <- extract_tables("Rice-Varieties-1996-2012.pdf", guess=FALSE,
pages=2,
area=list(c(80,20,2000,130) ) )[[1]]
sel3 <- gsub(" ","",out3)
head3 <- aggregate(sel3[1:2, ], by=list(rep(1,2)), paste0, collapse="") %>%
select(-Group.1)
body3 <- sel3[-(1:2), ]
brks3 <- body3[ ,1]!=""
ibrk3 <- c((1:nrow(body3))[brks3], nrow(body3)+1)
ll3 <- unlist(sapply(1:(length(ibrk3)-1), function(k) rep(ibrk3[k],ibrk3[k+1]-ibrk3[k])))
stopifnot(length(ll3)==nrow(body3))
body3 <- data.frame(body3, stringsAsFactors=FALSE)
colnames(body3) <- head3
tab3 <- aggregate(body3, by=list(ll3), paste0, collapse="") %>%
select(-Group.1)
print(tab3)
## I have not managed to find a value of `area` which correctly splits
## the last two columns *and* allows to identify the rows in each record...
tab <- tab %>% left_join(tab3)

Resources