Read file into R keeping end of lines - r

Probably a simple question, and I have looked at the many options in scan, but I haven't got what I want yet.
A simple example would be
require(httr)
example <- content(GET("http://www.r-project.org"), as = 'text')
write(example, 'text.txt')
input <- readLines('text.txt')
> example
[1] "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">\n<html>\n<head>\n<title>The R Project for Statistical Computing</title>\n<link rel=\"icon\" href=\"favicon.ico\" type=\"image/x-icon\">\n<link rel=\"shortcut icon\" href=\"favicon.ico\" type=\"image/x-icon\">\n<link rel=\"stylesheet\" type=\"text/css\" href=\"R.css\">\n</head>\n\n<FRAMESET cols=\"1*, 4*\" border=0>\n<FRAMESET rows=\"120, 1*\">\n<FRAME src=\"logo.html\" name=\"logo\" frameborder=0>\n<FRAME src=\"navbar.html\" name=\"contents\" frameborder=0>\n</FRAMESET>\n<FRAME src=\"main.shtml\" name=\"banner\" frameborder=0>\n<noframes>\n<h1>The R Project for Statistical Computing</h1>\n\nYour browser seems not to support frames,\nhere is the contents page of the R Project's\nwebsite.\n</noframes>\n</FRAMESET>\n\n\n\n"
input
[1] "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">"
[2] "<html>"
[3] "<head>"
[4] "<title>The R Project for Statistical Computing</title>"
[5] "<link rel=\"icon\" href=\"favicon.ico\" type=\"image/x-icon\">"
[6] "<link rel=\"shortcut icon\" href=\"favicon.ico\" type=\"image/x-icon\">"
[7] "<link rel=\"stylesheet\" type=\"text/css\" href=\"R.css\">"
[8] "</head>"
[9] ""
[10] "<FRAMESET cols=\"1*, 4*\" border=0>"
[11] "<FRAMESET rows=\"120, 1*\">"
[12] "<FRAME src=\"logo.html\" name=\"logo\" frameborder=0>"
[13] "<FRAME src=\"navbar.html\" name=\"contents\" frameborder=0>"
[14] "</FRAMESET>"
[15] "<FRAME src=\"main.shtml\" name=\"banner\" frameborder=0>"
[16] "<noframes>"
[17] "<h1>The R Project for Statistical Computing</h1>"
[18] ""
[19] "Your browser seems not to support frames,"
[20] "here is the contents page of the R Project's"
[21] "website."
[22] "</noframes>"
[23] "</FRAMESET>"
[24] ""
[25] ""
[26] ""
[27] ""
The motivation for this is that I want to store various files in PostgreSQL, and I am passing them in the format given by example as opposed to input. Apologies if I haven't explained this very well.
@Hong Ooi gave a nice answer using readChar. I have encoding issues, so I have had to wrap
iconv(readChar(file, nchars=file.info(file)["size"], TRUE), from = "latin1", to = "UTF-8")
to stop the database complaining.

If you want all those strings concatenated into a single string:
paste(input, collapse="\n")
Alternatively, if you're reading from a file and want to avoid splitting the input into bits and putting them back together:
f <- readChar(file, nchars=file.info(file)["size"], TRUE)

Related

List.files based on numbers

I am trying to create a list of files on which I want to run a function. I created a pattern which matches 35 files which I want to use.
mypattern <- paste0("NBS_NLoans_since2009_", seq(1, 35),".xls")
[1] "NBS_NLoans_since2009_1.xls" "NBS_NLoans_since2009_2.xls" "NBS_NLoans_since2009_3.xls" "NBS_NLoans_since2009_4.xls"
[5] "NBS_NLoans_since2009_5.xls" "NBS_NLoans_since2009_6.xls" "NBS_NLoans_since2009_7.xls" "NBS_NLoans_since2009_8.xls"
[9] "NBS_NLoans_since2009_9.xls" "NBS_NLoans_since2009_10.xls" "NBS_NLoans_since2009_11.xls" "NBS_NLoans_since2009_12.xls"
[13] "NBS_NLoans_since2009_13.xls" "NBS_NLoans_since2009_14.xls" "NBS_NLoans_since2009_15.xls" "NBS_NLoans_since2009_16.xls"
[17] "NBS_NLoans_since2009_17.xls" "NBS_NLoans_since2009_18.xls" "NBS_NLoans_since2009_19.xls" "NBS_NLoans_since2009_20.xls"
[21] "NBS_NLoans_since2009_21.xls" "NBS_NLoans_since2009_22.xls" "NBS_NLoans_since2009_23.xls" "NBS_NLoans_since2009_24.xls"
[25] "NBS_NLoans_since2009_25.xls" "NBS_NLoans_since2009_26.xls" "NBS_NLoans_since2009_27.xls" "NBS_NLoans_since2009_28.xls"
[29] "NBS_NLoans_since2009_29.xls" "NBS_NLoans_since2009_30.xls" "NBS_NLoans_since2009_31.xls" "NBS_NLoans_since2009_32.xls"
[33] "NBS_NLoans_since2009_33.xls" "NBS_NLoans_since2009_34.xls" "NBS_NLoans_since2009_35.xls"
Then I used the pattern to get those files from my directory. I got only one file. I have tried different patterns but either I got one file or more than 35 files. Thanks for any suggestion.
list.files(pattern = mypattern)
[1] "NBS_NLoans_since2009_1.xls"

Remove string from a vector in R

I have a vector that looks like
> inecodes
[1] "01001" "01002" "01049" "01003" "01006" "01037" "01008" "01004" "01009" "01010" "01011"
[12] "01013" "01014" "01016" "01017" "01021" "01022" "01023" "01046" "01056" "01901" "01027"
[23] "01019" "01020" "01028" "01030" "01031" "01032" "01902" "01033" "01036" "01058" "01034"
[34] "01039" "01041" "01042" "01043" "01044" "01047" "01051" "01052" "01053" "01054" "01055"
And I want to remove these "numbers" from this vector:
>pob
[1] "01001-Alegría-Dulantzi" "01002-Amurrio"
[3] "01049-Añana" "01003-Aramaio"
[5] "01006-Armiñón" "01037-Arraia-Maeztu"
[7] "01008-Arratzua-Ubarrundia" "01004-Artziniega"
[9] "01009-Asparrena" "01010-Ayala/Aiara"
[11] "01011-Baños de Ebro/Mañueta" "01013-Barrundia"
[13] "01014-Berantevilla" "01016-Bernedo"
[15] "01017-Campezo/Kanpezu" "01021-Elburgo/Burgelu"
[17] "01022-Elciego" "01023-Elvillar/Bilar"
[19] "01046-Erriberagoitia/Ribera Alta"
They are longer than these samples and they don't have the same length. The answer must be like the following:
>pob
[1] "Alegría-Dulantzi" "Amurrio"
[3] "Añana" "Aramaio"
[5] "Armiñón" "Arraia-Maeztu"
[7] "Arratzua-Ubarrundia" "Artziniega"
[9] "Asparrena" "Ayala/Aiara"
[11] "Baños de Ebro/Mañueta" "Barrundia"
[13] "Berantevilla" "Bernedo"
[15] "Campezo/Kanpezu" "Elburgo/Burgelu"
[17] "Elciego" "Elvillar/Bilar"
[19] "Erriberagoitia/Ribera Alta"
Not sure why you needed inecodes at all, since you can use sub to remove the leading digits and the hyphen:
sub('^\\d+-', '', pob)
Result:
[1] "Alegría-Dulantzi" "Amurrio" "Añana"
[4] "Aramaio" "Armiñón" "Arraia-Maeztu"
[7] "Arratzua-Ubarrundia" "Artziniega" "Asparrena"
[10] "Ayala/Aiara" "Baños de Ebro/Mañueta" "Barrundia"
[13] "Berantevilla" "Bernedo" "Campezo/Kanpezu"
[16] "Elburgo/Burgelu" "Elciego" "Elvillar/Bilar"
[19] "Erriberagoitia/Ribera Alta"
One reason that you might need inecodes is that you have codes in pob that don't exist in inecodes, but that doesn't seem like the case here. If you insist on using inecodes to remove numbers from pob, you can use str_replace_all from stringr:
library(stringr)
str_replace_all(pob, setNames(rep("", length(inecodes)), paste0(inecodes, "-")))
This gives you the exact same result:
[1] "Alegría-Dulantzi" "Amurrio" "Añana"
[4] "Aramaio" "Armiñón" "Arraia-Maeztu"
[7] "Arratzua-Ubarrundia" "Artziniega" "Asparrena"
[10] "Ayala/Aiara" "Baños de Ebro/Mañueta" "Barrundia"
[13] "Berantevilla" "Bernedo" "Campezo/Kanpezu"
[16] "Elburgo/Burgelu" "Elciego" "Elvillar/Bilar"
[19] "Erriberagoitia/Ribera Alta"
Data:
inecodes = c("01001", "01002", "01049", "01003", "01006", "01037", "01008",
"01004", "01009", "01010", "01011", "01013", "01014", "01016",
"01017", "01021", "01022", "01023", "01046", "01056", "01901",
"01027", "01019", "01020", "01028", "01030", "01031", "01032",
"01902", "01033", "01036", "01058", "01034", "01039", "01041",
"01042", "01043", "01044", "01047", "01051", "01052", "01053",
"01054", "01055")
pob = c("01001-Alegría-Dulantzi", "01002-Amurrio", "01049-Añana", "01003-Aramaio",
"01006-Armiñón", "01037-Arraia-Maeztu", "01008-Arratzua-Ubarrundia",
"01004-Artziniega", "01009-Asparrena", "01010-Ayala/Aiara", "01011-Baños de Ebro/Mañueta",
"01013-Barrundia", "01014-Berantevilla", "01016-Bernedo", "01017-Campezo/Kanpezu",
"01021-Elburgo/Burgelu", "01022-Elciego", "01023-Elvillar/Bilar",
"01046-Erriberagoitia/Ribera Alta")
library(stringr)
for(code in inecodes) {
ix <- which(str_detect(pob, code))
pob[ix] <- unlist(str_split(pob, "-", 2))[2]
}
Try this. Match should be much faster
pos<-which(!is.na(pob[match(sub('^([0-9]+)-.*$','\\1',pob),inecodes)]))
pob[pos]<-sub('^[0-9]+-(.*)$','\\1',pob[pos])
Please do post the timings if you manage to get this. Match usually solves many computational issues with lookups on large data sets. I would like to see if there are any opposite scenarios.
A bit shorter than sub, str_detect and str_replace is str_remove:
library(stringr)
c("01001-Alegría-Dulantzi", "01002-Amurrio") %>%
str_remove("[0-9]*-")
returns
"Alegría-Dulantzi" "Amurrio"

How to turn rvest output into table

Brand new to R, so I'll try my best to explain this.
I've been playing with data scraping using the "rvest" package. In this example, I'm scraping US state populations from a table on Wikipedia. The code I used is:
library(rvest)
statepop = read_html("https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_population")
forecasthtml = html_nodes(statepop, "td")
forecasttext = html_text(forecasthtml)
forecasttext
The resulting output was as follows:
[2] "7000100000000000000♠1"
[3] " California"
[4] "39,250,017"
[5] "37,254,503"
[6] "7001530000000000000♠53"
[7] "738,581"
[8] "702,905"
[9] "12.15%"
[10] "7000200000000000000♠2"
[11] "7000200000000000000♠2"
[12] " Texas"
[13] "27,862,596"
[14] "25,146,105"
[15] "7001360000000000000♠36"
[16] "763,031"
[17] "698,487"
[18] "8.62%"
How can I turn these strings of text into a table that is set up similar to the way it is presented on the original Wikipedia page (with columns, rows, etc)?
Try using rvest's html_table function.
Note there are five tables on the page thus you will need to specify which table you would like to parse.
library(rvest)
statepop = read_html("https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_population")
#find all of the tables on the page
tables<-html_nodes(statepop, "table")
#convert the first table into a dataframe
table1<-html_table(tables[1])

How to print the error of a system call in R

I'm attempting to use the system function in R to run a program, which I expect to yield an error message in some cases. For this I want to write a tryCatch function.
system(command, intern = TRUE) only returns the actual values which were echo'd by the program I'm running, it does not return my error.
In R, how can I get the error message which was yielded by my system?
My code:
test <- tryCatch({
cmd <- paste0("../Scripts/Plink2/plink --file ../InputData/",prefix," --bmerge ",
"../InputData/fs --missing --out ../InputData/",prefix)
print(cmd)
system(cmd)
} , error = function(e) {
# error handler picks up where error was generated
print("EZEL")
print(paste("MY_ERROR: ",e))
}, finally = {
print("something")
})
[1] "../Scripts/Plink2/plink --file ../InputData/GS80Kdata --bmerge ../InputData/fs --missing --out ../InputData/GS80Kdata"
PLINK v1.90b3.37 64-bit (16 May 2016) https://www.cog-genomics.org/plink2
#....
#skipping some lines here to reduce size
#....
Of these, 1414410 are new, while 2462 are present in the base dataset.
Error: 1 variant with 3+ alleles present.
* If you believe this is due to strand inconsistency, try --flip with
# Skipping some more lines here
[1] "something"
However, using intern=TRUE and assigning the result of the system function to a variable doesn't capture the error in the vector; it is still printed in the R console.
Edit: Here the output of the vector (using gsub to reduce the ridiculous size)
> gsub(pattern="\b\\d.*", replacement = "", x = tst)
[1] "PLINK v1.90b3.37 64-bit (16 May 2016) https://www.cog-genomics.org/plink2"
[2] "(C) 2005-2016 Shaun Purcell, Christopher Chang GNU General Public License v3"
[3] "Logging to ../InputData/GS80Kdata.log."
[4] "Options in effect:"
[5] " --bmerge ../InputData/fs"
[6] " --file ../InputData/GS80Kdata"
[7] " --missing"
[8] " --out ../InputData/GS80Kdata"
[9] ""
[10] "64381 MB RAM detected; reserving 32190 MB for main workspace."
[11] "Scanning .ped file... 0%\b"
[12] "2%\b\b"
[13] "%\b\b"
[14] "\b\b"
[15] "\b"
[16] ""
[17] "58%\b\b"
[18] "7%\b\b"
[19] "%\b\b"
[20] "\b\b"
[21] "\b"
[22] "Performing single-pass .bed write (42884 variants, 14978 people)."
[23] "0%\b"
[24] "../InputData/GS80Kdata-temporary.bim + ../InputData/GS80Kdata-temporary.fam"
[25] "written."
[26] "14978 people loaded from ../InputData/GS80Kdata-temporary.fam."
[27] "144 people to be merged from ../InputData/fs.fam."
[28] "Of these, 140 are new, while 4 are present in the base dataset."
[29] "42884 markers loaded from ../InputData/GS80Kdata-temporary.bim."
[30] "1416872 markers to be merged from ../InputData/fs.bim."
[31] "Of these, 1414410 are new, while 2462 are present in the base dataset."
attr(,"status")
[1] 3
>

Assistance with <name> and <styleUrl> in .kml when using writeOGR() from rgdal

I have a data frame containing coordinates to various locations that I'd like to use with Google Earth. Here's a simple example showing the structure:
data <- data.frame(country = "USA", city = "Saint Paul",
lat = 44.9629, lon = -93.00146)
I followed this SO post and this guide to create KML output successfully using the writeOGR() function from the rgdal package, however I'm having trouble tweaking the attributes. Here's the code:
# you may need to install gdal itself for the package to install successfully
# install.packages("rgdal")
library(rgdal)
data_sp <- data
coordinates(data_sp) <- c("lon", "lat")
proj4string(data_sp) <- CRS("+init=epsg:4238")
data_ll <- spTransform(data_sp, CRS("+proj=longlat +datum=WGS84"))
writeOGR(data_ll["city"], "/path/to/test.kml", driver = "KML", layer = "city")
The result works fine for just viewing locations, but I'd like to change the <styleUrl> attribute as well as have the <name> attribute populated. Without it, Google Earth shows locations with a [no name] attribute:
Here's the resultant .kml file:
<?xml version="1.0" encoding="utf-8" ?>
<kml xmlns="http://www.opengis.net/kml/2.2">
<Document><Folder><name>city</name>
<Placemark>
<ExtendedData><SchemaData schemaUrl="#city">
<SimpleData name="city">Saint Paul</SimpleData>
</SchemaData></ExtendedData>
<Point><coordinates>-93.001753817020003,44.96282130428127</coordinates></Point>
</Placemark>
</Folder>
<Schema name="city" id="city">
<SimpleField name="city" type="string"></SimpleField>
</Schema>
</Document></kml>
I need to either get a <name> element to populate with the SimpleField name="city" contents, or have <name>City</name> tags added to each <Placemark>. What I'd like is something like this as the final result (note added <Style> definition, <styleUrl> attribute for the <Placemark>, and <name> attribute added):
<?xml version="1.0" encoding="utf-8" ?>
<kml xmlns="http://www.opengis.net/kml/2.2">
<Document>
<Style id="custom">
<IconStyle>
<scale>1.5</scale>
<Icon>
<href>http://upload.wikimedia.org/wikipedia/commons/a/af/Tux.png</href>
</Icon>
</IconStyle>
</Style>
<Folder><name>city</name>
<Placemark>
<name>Saint Paul</name>
<styleUrl>#custom</styleUrl>
<ExtendedData><SchemaData schemaUrl="#city">
<SimpleData name="city">Saint Paul</SimpleData>
</SchemaData></ExtendedData>
<Point><coordinates>-93.001753817020003,44.96282130428127</coordinates></Point>
</Placemark>
</Folder>
<Schema name="city" id="city">
<SimpleField name="city" type="string"></SimpleField>
</Schema>
</Document></kml>
Here's what the result looks like (similar to what I'm aiming for):
The rgdal documentation mentions a layer_options attribute, but nothing intuitively stuck out to me...
layer_options = c("<name>????</name>")?
layer_options = c("<styleUrl>#custom</styleUrl>")?
Something else?
The attempts above to pass a tag directly don't appear to affect the output.
There aren't many examples that I found when googling, other than creating the default output from writeOGR(), as shown above. Thanks for any suggestions.
To expand on @jlhoward's answer above, I was able to use kmlPoints() to accomplish what I was looking for:
data <- data.frame(country = "USA", city = "Saint Paul",
lat = 44.9629, lon = -93.00146)
# you may need to install gdal itself for the package to install successfully
# install.packages("rgdal")
library(rgdal)
library(maptools)
data_sp <- data
coordinates(data_sp) <- c("lon", "lat")
proj4string(data_sp) <- CRS("+init=epsg:4238")
data_ll <- spTransform(data_sp, CRS("+proj=longlat +datum=WGS84"))
kmlPoints(data_ll["city"], kmlfile = "~/Desktop/test.kml",
name = data_ll$city,
icon = "http://upload.wikimedia.org/wikipedia/commons/a/af/Tux.png")
The output contains both the desired <name> attribute as well as a <Style> definition for the custom icon, which is applied successfully to the <Placemark> entries:
readLines("test.kml")
readLines("test.kml")
[1] "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
[2] "<kml xmlns=\"http://earth.google.com/kml/2.2\">"
[3] "<Document>"
[4] "<name></name>"
[5] "<description><![CDATA[]]></description>"
[6] ""
[7] "<Style id=\"style1\">"
[8] " <IconStyle>"
[9] " <Icon>"
[10] " <href>http://upload.wikimedia.org/wikipedia/commons/a/af/Tux.png</href>"
[11] " </Icon>"
[12] " </IconStyle>"
[13] "</Style>"
[14] ""
[15] "<Placemark>"
[16] " <name>Saint Paul</name>"
[17] " <description><![CDATA[]]></description>"
[18] " <styleUrl>#style1</styleUrl>"
[19] " <Point>"
[20] " <coordinates>"
[21] "-93.00175381702,44.9628213042813"
[22] " </coordinates>"
[23] " </Point>"
[24] "</Placemark>"
[25] "</Document>"
[26] "</kml>"
The result:
Well, if all you want to do is populate the <name> element in each <Placemark>, this will do it:
library(maptools)
kmlPoints(data_ll,"test.kml",name=data$city)
readLines("test.kml")
# [1] "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
# [2] "<kml xmlns=\"http://earth.google.com/kml/2.2\">"
# [3] "<Document>"
# [4] "<name></name>"
# ...
# [15] "<Placemark>"
# [16] " <name>Saint Paul</name>"
# [17] " <description><![CDATA[]]></description>"
# [18] " <styleUrl>#style1</styleUrl>"
# [19] " <Point>"
# [20] " <coordinates>"
# [21] "-93.00175381702,44.9628213042813"
# [22] " </coordinates>"
# [23] " </Point>"
# [24] "</Placemark>"
# [25] "</Document>"
# [26] "</kml>"
If you need to change the <Style> as well, then I'm afraid you may have to hack the kml file using the XML package.

Resources