generate variable names in for loop - r

Hope you don't mind if this is too easy for you.
In R, I am using fromJSON() to read from 3 urls (tier 1 url) , in the JSON file there is "link" field which give me another url (tier 2 url) and I use that and read.table() to get my final data. My code now is like this:
# note, this code does not run
urlJohn <- www.foo1.com
urlJane <- www.foo2.com
urlJoe <- www.foo3.com
tempJohn <- fromJson(urlJohn)
tempJohn[["data"]][["rows"]]$link %<>%
{clean up this data}
dataJohn <- read.table(tempJohn[["data"]][["rows"]]$link,
header = TRUE,
sep = ",")
tempJane <- fromJson(urlJane)
tempJane[["data"]][["rows"]]$link %<>%
{clean up this data}
dataJane <- read.table(tempJane[["data"]][["rows"]]$link,
header = TRUE,
sep = ",")
tempJoe <- fromJson(urlJoe)
tempJoe[["data"]][["rows"]]$link %<>%
{clean up this data}
dataJoe <- read.table(tempJoe[["data"]][["rows"]]$link,
header = TRUE,
sep = ",")
As you can see, I am just copying-n-pasting code blocks. What I wish is this:
# note, this code also does not run
urlJohn <- www.foo1.com
urlJane <- www.foo2.com
urlJoe <- www.foo3.com
source <- c("John", "Jane", "joe")
for (i in source){
temp <- paste(temp, i, sep = "")
url <- paste(url, i, sep = "")
data <- paste(data, i, sep = "")
temp <- fromJson(url)
temp[["data"]][["rows"]]$link %<>%
{clean up this data}
data <- read.table(temp[["data"]][["rows"]]$link,
header = TRUE,
sep = ",")
}
What do I need to do to make the for loop work? If my question is not clear, please ask me to clarify it.

I usually find lapply more convenient than a for loop, although you can easily convert this to a for loop if needed.
URLs <- c('www.foo1.com', 'www.foo2.com', 'www.foo3.com')
lapply(URLs, function(x) {
temp <- jsonlite::fromJSON(x)
temp[["data"]][["rows"]]$link %<>% {clean up this data}
read.table(temp[["data"]][["rows"]]$link,header = TRUE,sep = ",")
}) -> list_data
list_data

Thanks to @Ronak Shah. The R community strongly favors non-for-loop solutions.
The way to get my desired result is lapply.
Below is non-running pseudocode:
URLs <- c('www.foo1.com', 'www.foo2.com', 'www.foo3.com')
lapply(URLs, function(x) {
temp <- jsonlite::fromJSON(x)
x <- temp[["data"]][["rows"]]$link %<>% {clean up this data}
y <- read.table(temp[["data"]][["rows"]]$link,header = TRUE,sep = ",")
return(list(x, y))
})
And this is a running example.
# Running example: for every element of `x`, return the element itself
# together with half of its sum.
x <- list(
  alpha = 1:10,
  beta = exp(-3:3),
  logic = c(TRUE, FALSE, FALSE, TRUE)
)
lapply(x, function(values) {
  half_sum <- sum(values) / 2
  list(values, half_sum)
})

Related

Working with large csv file in R

any help will be appreciated.
I used the following code to break down my large csv file (4gb) and now I am trying to save the 2nd, 3rd... part into a csv. However, I can only access the first chunk of my data.
Is there anything wrong with my code?
How do I save the second chunk of my data into csv?
rgfile <- 'filename.csv'
index <- 0
chunkSize <- 100000
con <- file(description = rgfile, open="r")
dataChunk <- read.table(con, nrows= chunkSize, header=T, fill= TRUE, sep= ",")
actualColumnNames <- names(dataChunk)
repeat {
index <- index + 1
print(paste('Processing rows:', index * chunkSize))
if (nrow(dataChunk) != chunkSize){
print('Processed all files!')
break
}
dataChunk <- read.table(
con, nrows = chunkSize, skip=0, header = FALSE,
fill=TRUE, sep = ",", col.names=actualColumnNames
)
break
}
library(tidyverse)
library(nycflights13)
# make the problem reproducible
rgfile <- "flights.csv"
write_csv(flights, rgfile)
# now, get to work
# Total number of lines in the file, including the header line.
lines <- as.numeric(R.utils::countLines(rgfile))
chunk_size <- 100000
# Read just the first rows to learn the column names.
hdr <- read_csv(rgfile, n_max = 2)
# Everything after the header line is data.
data_rows <- lines - 1
# 1-based index of the first data row of every chunk; empty when the file
# holds nothing but a header (seq() would otherwise fail on a descending range).
chunk_starts <- if (data_rows > 0) {
  seq(1, data_rows, by = chunk_size)
} else {
  numeric(0)
}
fnum <- 1
for (i in chunk_starts) {
  # Skipping `i` lines skips the header line plus the i - 1 data rows already
  # written by earlier chunks. Because the column names are passed explicitly,
  # the first line after the skip is treated as data, so the header itself
  # must always be part of what is skipped.
  x <- suppressMessages(
    read_csv(
      rgfile,
      col_names = colnames(hdr), skip = i, n_max = chunk_size
    )
  )
  write_csv(x, sprintf("file%03d.csv", fnum))
  fnum <- fnum + 1
}

read tables and assign to a string in a loop in R

I'm sure there is a trivial answer to this but I can't seem to find the right code. I have a list of files and a list of strings that I would like to assign the contents of those files to as dataframes. Then I would like to perform other things on the dataframes within the same loop. I also need to keep each dataframe for downstream work. here is my code:
samples <- c('fc14','g14','fc18','g18','fc21','g21')
fc_samples <- grep("fc", samples, value=TRUE)
fc_files <- c('fc14_g14_full_annot_uniq.txt','fc18_g18_full_annot_uniq.txt','fc21_g21_full_annot_uniq.txt')
# make dataframes
for (file in fc_files)
{ fc_n <- 1
g_n <- 1
print(file);
# THE BIT THAT DOESN'T WORK
assign(paste("data", fc_samples[fc_n], sep='_'), read.table(file,sep = "\t", header=T));
# HERE I EXPECT THE TOP OF MY DF TO BE PRINTED BUT IT ISN'T
head(data_fc14);
# I TRY THIS INSTEAD
do.call("<-",list(paste("data", fc_samples[fc_n], sep='_'), read.table(file,sep = "\t", header=T)))
# I TRY TO PRINT THE DF AGAIN BUT STILL NO LUCK
head(paste("data", fc_samples[fc_n], sep='_'))
# FIRST DOWNSTREAM THING I WOULD LIKE TO DO,
# WON'T WORK UNTIL I SOLVE THE DF ASSIGNMENT ISSUE
names(paste("data", fc_samples[fc_n], sep='_'))[names(paste("data", fc_samples[fc_n], sep='_'))==c('SAMPLE_fc','CHROM_fc','START_fc','REF_fc','ALT_fc','REGION_fc','DP_fc','FREQ_fc','GENE_fc','AFFECTS_fc','dbSNP_fc',
# 'NOVEL_fc')] <- c('SAMPLE','CHROM','START','REF','ALT','REGION','DP','FREQ','GENE','AFFECTS','dbSNP','NOVEL')
# ITERATE TO THE NEXT FILE
fc_n <- fc_n+1
}
I tried solutions from here and here but it didn't help.
If anyone has an elegant solution to this then that would be great! Thanks in advance!
Fixing your code:
samples <- c('fc14','g14','fc18','g18','fc21','g21')
fc_samples <- grep("fc", samples, value=TRUE)
# Make dummy example files
fc_files <- file.path("example-data", c(
'fc14_g14_full_annot_uniq.txt','fc18_g18_full_annot_uniq.txt',
'fc21_g21_full_annot_uniq.txt'))
set.seed(123) ; dummy_df <-
setNames(
as.data.frame(replicate(12, rnorm(7))),
c('SAMPLE_fc','CHROM_fc','START_fc','REF_fc','ALT_fc','REGION_fc',
'DP_fc','FREQ_fc','GENE_fc','AFFECTS_fc','dbSNP_fc','NOVEL_fc')
)
if (!dir.exists("./example-data")) dir.create("example-data")
invisible({
lapply(fc_files, write.table, x = dummy_df, sep = "\t")
})
# Walk the files and their sample names in step. (A counter that is reset to 1
# inside the loop would store every file under the first name, `data_fc14`.)
fc_old_names <- c('SAMPLE_fc','CHROM_fc','START_fc',
                  'REF_fc','ALT_fc','REGION_fc',
                  'DP_fc','FREQ_fc','GENE_fc','AFFECTS_fc',
                  'dbSNP_fc','NOVEL_fc')
fc_new_names <- c('SAMPLE','CHROM','START','REF','ALT','REGION','DP','FREQ',
                  'GENE','AFFECTS','dbSNP','NOVEL')
for (fc_n in seq_along(fc_files)) {
  df_name <- paste("data", fc_samples[fc_n], sep = "_")
  f <- read.table(fc_files[fc_n], sep = "\t", header = TRUE)
  # Rename by looking each column name up in `fc_old_names` instead of
  # comparing the two vectors position by position: a missing or reordered
  # column then leaves the other columns correctly named rather than shifting
  # the replacement names onto the wrong columns.
  hit <- match(names(f), fc_old_names)
  found <- !is.na(hit)
  names(f)[found] <- fc_new_names[hit[found]]
  # Store the renamed data frame under its generated name, e.g. `data_fc14`.
  assign(df_name, f)
}
A "more elegant" way:
assign()ing is not considered best practice, rather work with lists.
Though I occasionally use it myself, there are sometimes good reasons to.
# For the '%>%' pipe
library(magrittr)
data <-
samples %>%
grep(pattern = "fc", value = TRUE) %>%
setNames(nm = .) %>%
lapply(grep, x = fc_files, value = TRUE) %>%
lapply(read.table, sep = "\t", header = TRUE) %>%
lapply(function(f) setNames(f, sub("_fc", "", names(f))))
identical(data_fc14, data$fc14)
# [1] TRUE
identical(data_fc18, data$fc18)
# [1] TRUE
identical(data_fc21, data$fc21)
# [1] TRUE
# Clean up
print(unlink("example-data", recursive = TRUE))
samples <- c('fc14','g14','fc18','g18','fc21','g21')
fc_samples <- grep("fc", samples, value=TRUE)
fc_files <- c('fc14_g14_full_annot_uniq.txt','fc18_g18_full_annot_uniq.txt','fc21_g21_full_annot_uniq.txt')
g_files <- c('g14_full_annot_uniq.txt','g18_full_annot_uniq.txt','g21_full_annot_uniq.txt')
# make dataframes
df_names <- c("data_fc14","data_fc18","data_fc21")
fc_n <- 1
for (file in fc_files)
{
assign(df_names[fc_n], read.table(file,sep = "\t", header=T)); #WORKS
#do.call("<-",list(paste("data", fc_samples[fc_n], sep='_'), read.table(file,sep = "\t", header=T))); #ALSO WORKS
print(head(df_names[fc_n]))
print(head(eval(as.symbol(df_names[fc_n]))))
df <- eval(as.symbol(df_names[fc_n]))
names(df)[names(df) == c('SAMPLE_fc','CHROM_fc','START_fc','REF_fc','ALT_fc','REGION_fc','DP_fc','FREQ_fc','GENE_fc','AFFECTS_fc','dbSNP_fc',
'NOVEL_fc')] <- c('SAMPLE','CHROM','START','REF','ALT','REGION','DP','FREQ','GENE','AFFECTS','dbSNP','NOVEL')
assign(df_names[fc_n], df)
print(head(eval(as.symbol(df_names[fc_n]))))
print(file);
fc_n <- fc_n+1
}
Thanks to all who helped. In the end I solved it using the advice from "apom", as it is the most intuitive for novice R users.

R solving hackerrank challenge

I would like to solve the challenge. The language of my preference is R. I am not sure how to receive input. On hackerrank coding window it says that
"# Enter your code here. Read input from STDIN. Print output to STDOUT"
So far I am used to receiving input by using
v1 <- readline("Enter two integers: ")
How should i receive input on hackerrank? I tried to see solved examples but couldn't find any solved examples.
update 1
Below code works in R. Only problem is number of steps and ball values are not provided from keyboard input. We have to update them manually on line 1 and line2. How could I get update below solution so that it works on hackerrank?
steps=4
ball_numbers=c(1,2,2,2)
d=as.data.frame(c(0,1))
for (i in (1:(length(ball_numbers)-1)))
{
assign(x = paste("A", i, sep = ""),value = c(0,1))
e <- as.data.frame(get(paste("A", i, sep = "")))
colnames(e) <- paste("A", i, sep="")
d <- merge(d,e)
}
d=as.matrix(t(d))
answer=sum(ball_numbers %*% d)/ncol(d)
update2
Below code produces correct answer
# Enter your code here. Read input from STDIN. Print output to STDOUT
nums <- read.table("/dev/stdin", sep=" ");
nums <- as.matrix(as.data.frame(t(nums)))
steps=nums[1]
ball_numbers=nums[2:length(nums)]
d=as.data.frame(c(0,1))
for (i in (1:(length(ball_numbers)-1)))
{
assign(paste("A", i, sep = ""),value = c(0,1))
e <- as.data.frame(get(paste("A", i, sep = "")))
colnames(e) <- paste("A", i, sep="")
d <- merge(d,e)
}
d=as.matrix(t(d))
#answer=as.numeric(format(round(sum(ball_numbers %*% d)/ncol(d),1),nsmall=1))
answer = print(format(sum(ball_numbers %*% d)/ncol(d),nsmall=1, digits = 1), quote = F)
write.table(as.numeric(answer), sep = "", append=T, row.names = F, col.names = F,quote = FALSE,)
I get below output
[1] 2.0
2
which is different from expected output which is below. How can i modify my code to get the correct format of output
2.0
Look at the "warmup".
data <- suppressWarnings(read.table("stdin", sep=" "));
Alternatively you can use
data <- suppressWarnings(readLines(file("stdin")))
Also Refer this page in hackerrank
I faced a similar issue reading input in R on HackerRank. To use readLines, I used the following:
input<-file('stdin', 'r')
x <- readLines(input, n=1)
If you want to read another value y, use the same approach:
y <- readLines(input, n=1)
#---this solves the problem
# Enter your code here. Read input from STDIN. Print output to STDOUT
# Read every whitespace-separated number from standard input: the first one
# is the step count, the remaining ones are the ball values.
con <- file("stdin")
nums <- scan(con, quiet = TRUE)
close(con)
steps <- nums[1]
# Negative indexing stays correct (empty) when only the count is present,
# whereas 2:length(nums) would count backwards.
ball_numbers <- nums[-1]
# The previous version enumerated all 2^n include/exclude combinations of the
# balls and averaged the resulting sums. Every ball appears in exactly half of
# those combinations, so that average is simply half of the total. This avoids
# the exponential table and also works for a single ball.
answer <- sum(ball_numbers) / 2
# format(..., nsmall = 1) always prints at least one decimal place.
cat(format(answer, nsmall = 1), sep = "\n")
Another approach:
# Read all lines from standard input and release the connection afterwards.
con <- file("stdin", open = "r")
input <- readLines(con)
close(con)
# Drop the first line and convert the rest in one vectorised step.
# input[-1] is simply empty when only one line was supplied, whereas looping
# over 2:length(input) would count backwards and index out of bounds.
z <- as.numeric(input[-1])
# Print half of the total, rounded to one decimal and always showing it.
cat(format(round(sum(z) / 2, 1), nsmall = 1), sep = "\n")
A very handy one-liner to read in from standard input is the scan function, for instance:
text <- scan(file = 'stdin', what = 'character', sep = '\r')

How to parse INI like configuration files with R?

Is there an R function for parsing INI like configuration files?
While searching I only found this discussion.
Here is an answer that was given to exactly the same question on r-help in 2007 (thanks to @Spacedman for pointing this out):
Parse.INI <- function(INI.filename)
{
  # Parse an INI-style configuration file into a nested list.
  #
  # Args:
  #   INI.filename: path of the INI file (anything readLines() accepts).
  #
  # Returns:
  #   A list with one element per section that contains at least one key.
  #   Each element is itself a list mapping key -> value, where every value
  #   is a character string. Whitespace around section names and keys is
  #   removed; values are kept exactly as read.table() returns them.
  #   A file with no content yields an empty list.
  #
  # Raises:
  #   An error when a key/value line appears before the first [section]
  #   header, or when no line contains a key/value separator.
  Lines <- readLines(INI.filename)
  if (!any(nzchar(trimws(Lines)))) {
    return(list())
  }

  # Rewrite "[section]" as "=section=" so that, once split on "=", a header
  # row has an empty first field and the section name in the second field.
  # Only lines that start with "[" are rewritten, so brackets inside values
  # are left alone.
  is_bracket_line <- grepl("^\\s*\\[", Lines)
  Lines[is_bracket_line] <- chartr("[]", "==", Lines[is_bracket_line])

  # Read every field as text so that numeric-looking keys or values are not
  # converted.
  d <- read.table(text = Lines, sep = "=", fill = TRUE,
                  colClasses = "character")
  if (ncol(d) < 2L) {
    stop("no 'key=value' or '[section]' line found in INI file",
         call. = FALSE)
  }

  keys <- trimws(d$V1)
  vals <- d$V2
  # read.table() turns the literal text NA into a missing value; restore it.
  keys[is.na(keys)] <- "NA"
  vals[is.na(vals)] <- "NA"

  is_header <- keys == ""            # location of section breaks
  owner <- cumsum(is_header)         # index of the header each row belongs to
  if (any(owner == 0L)) {
    stop("INI file has a key/value line before the first [section] header",
         call. = FALSE)
  }
  sections <- trimws(vals[which(is_header)[owner]])

  # Fill the result by direct indexing. Building assignment statements as
  # text and evaluating them would break (or run arbitrary code) for values
  # that contain quotes or backslashes and for section names with spaces.
  INI.list <- list()
  for (i in which(!is_header)) {
    sec <- sections[i]
    if (is.null(INI.list[[sec]])) {
      INI.list[[sec]] <- list()
    }
    INI.list[[sec]][[keys[i]]] <- vals[i]
  }
  return(INI.list)
}
Actually, I wrote a short and presumably buggy function (i.e. not covering all corner cases) which works for me now:
read.ini <- function(x) {
  # Parse INI-style text into a list of named character vectors.
  #
  # Args:
  #   x: either a single string without a newline, which is treated as a file
  #      path and read with readLines(), or the INI text itself (one string
  #      containing newlines, or a character vector of lines).
  #
  # Returns:
  #   A list with one element per [section]. Each element is a character
  #   vector of values named by key. Section names and keys are normalised
  #   with strcap(). Values keep any "=" they contain; a line without "="
  #   becomes a key with an empty value. If a section occurs twice, the later
  #   one replaces the earlier one.
  #
  # Raises:
  #   An error if the first non-comment, non-blank line is not a section
  #   header (including when there are no such lines at all).
  if (length(x) == 1 && !any(grepl("\\n", x))) lines <- readLines(x) else lines <- x
  # Split every element on newlines and flatten; taking only the first list
  # element here would discard all but the first line of a file.
  lines <- as.character(unlist(strsplit(lines, "\n", fixed = TRUE)))
  lines <- lines[!grepl("^;", lines) & nchar(lines) >= 2] # strip comments & blank lines
  lines <- gsub("\\r$", "", lines)

  idx <- grep("^\\[.+\\]$", lines)
  if (length(idx) == 0L || idx[[1]] != 1) {
    stop("invalid INI file. Must start with a section.")
  }
  # Last line belonging to each section: the line before the next header, or
  # the final line of the input for the last section.
  last <- c(idx[-1], length(lines) + 1L) - 1L

  res <- list()
  for (s in seq_along(idx)) {
    from <- idx[[s]]
    # seq_len() yields an empty range for a section without entries, where
    # (from + 1):(to - 1) would run backwards.
    body <- lines[from + seq_len(last[[s]] - from)]

    # Split each entry at its first "=" only, so values may contain "=" and
    # an empty value ("key=") stays paired with its key.
    pos <- regexpr("=", body, fixed = TRUE)
    has_eq <- pos > 0
    keys <- body
    vals <- character(length(body))
    keys[has_eq] <- substr(body[has_eq], 1, pos[has_eq] - 1)
    vals[has_eq] <- substr(body[has_eq], pos[has_eq] + 1, nchar(body[has_eq]))
    keys <- sub("[ ]*$", "", keys)   # spaces before "="
    vals <- sub("^[ ]*", "", vals)   # spaces after "="

    header <- lines[[from]]
    sec <- strcap(substring(header, 2, nchar(header) - 1))
    res[[sec]] <- setNames(vals, strcap(keys))
  }
  return(res)
}
where strcap is a helper function that capitalizes a string:
strcap <- function(s) {
  # Capitalise each string in `s`: upper-case its first character and
  # lower-case everything after it.
  first <- toupper(substr(s, 1, 1))
  rest <- tolower(substring(s, 2))
  paste0(first, rest)
}
There are also some C solutions for this, like inih or libini that might be useful. I did not try them out, though.

How to create a custom write.table function?

I can use write.table function to create an output data from a data.frame:
> write.table(head(cars), sep = "|", row.names=FALSE)
"speed"|"dist"
4|2
4|10
7|4
7|22
8|16
9|10
How can I create my own write.table function which creates an output like this (header with double pipes and data with preceding and succeeding pipes)?:
||"speed"||"dist"||
|4|2|
|4|10|
|7|4|
|7|22|
|8|16|
|9|10|
write.table can get you part of the way, but you will still need to do some fiddling around to get things to work just as you want.
Here's an example:
# Capture what write.table() would print, one string per output line.
# eol = "|\n" makes every line (the header included) end with a pipe.
x <- capture.output(
write.table(head(cars), sep = "|", row.names = FALSE, eol = "|\n"))
# Add the leading pipe to every line.
x2 <- paste0("|", x)
# Double every pipe on the header line only (fixed = TRUE: "|" is literal,
# not a regex alternation).
x2[1] <- gsub("|", "||", x2[1], fixed=TRUE)
# Print the lines, one per row.
cat(x2, sep = "\n")
# ||"speed"||"dist"||
# |4|2|
# |4|10|
# |7|4|
# |7|22|
# |8|16|
# |9|10|
As a function, I guess in its most basic form it could look something like:
write.myOut <- function(inDF, outputFile = "") {
  # Write a data frame as pipe-delimited text: the header row is wrapped in
  # and separated by double pipes, each data row by single pipes.
  #
  # Args:
  #   inDF: the data frame to write.
  #   outputFile: file name or connection passed to cat(); the default ""
  #     prints to the console.
  #
  # Returns:
  #   NULL, invisibly (the value of cat()).

  # Produce the header on its own from a zero-row slice, using "||" as the
  # separator. Doubling the pipes of an already-joined header line would also
  # double any "|" that is part of a column name.
  header <- capture.output(
    write.table(inDF[0, , drop = FALSE], sep = "||", row.names = FALSE,
                eol = "||\n"))
  rows <- capture.output(
    write.table(inDF, sep = "|", row.names = FALSE, col.names = FALSE,
                eol = "|\n"))
  # paste0() would turn an empty `rows` into a stray "|", so only prefix the
  # data lines when there are any.
  if (length(rows) > 0) {
    rows <- paste0("|", rows)
  }
  x <- c(paste0("||", header), rows)
  cat(x, sep = "\n", file = outputFile)
}
I don't think that it is possible with write.table. Here is a workaround:
# Format one row: join the values of `x` with `sep` and put `sep` at both
# the start and the end of the resulting string.
rowFun <- function(x, sep = "|") {
  joined <- paste(x, collapse = sep)
  paste0(sep, joined, sep)
}
# create strings: one formatted string per row of head(cars)
# NOTE(review): apply() works on a matrix version of the data frame; that is
# harmless for the all-numeric `cars`, but confirm the formatting before
# reusing this with mixed column types.
rows <- apply(head(cars), 1, rowFun)
# Wrap each column name in double quotes, then join them with "||".
# The regex has two branches: "^" inserts a quote at the start (\\1 is empty
# there), and "(.)$" re-emits the last character followed by a quote.
header <- rowFun(gsub("^|(.)$", "\\1\"", names(head(cars))), sep = "||")
# combine header and row strings
vec <- c(header, rows)
# write the vector, one element per line, to "myfile.sep"
write(vec, sep = "\n", file = "myfile.sep")
The resulting file:
||"speed"||"dist"||
|4|2|
|4|10|
|7|4|
|7|22|
|8|16|
|9|10|

Resources