Error in svytotal() : could not find function "svytotal"

I am close to running the script successfully, but I get this error at the final step. I am running it on RStudio Cloud. I followed the RECS 2020 microdata guide PDF (using the 2020 microdata file to compute estimates and standard errors (RSEs), pages 6-7, linked from the survey webpage).
Here is my complete script:
# Ref: file:///C:/Users/MMatam/OneDrive%20-%20University%20of%20Central%20Florida/Projects/20230123_US_EIA_DataAnalysis/Residential_BatteryPV_ElectricVehicle_MM/ResidentialEnergyConsumptionSurvey_RECS/microdata-guide.pdf
install.packages("survey")
library(survey)
# Ref: https://stackoverflow.com/questions/54621706/error-in-librarydplyr-there-is-no-package-called-dplyr
install.packages('dplyr')
library(dplyr)
# Import the CSV file from the local machine
# Ref: https://community.rstudio.com/t/how-can-i-upload-csv-or-excel-files-existing-in-computer-to-rstudio-cloud/23621
# To import the csv again into this space, right click on the file name and click import dataset
library(readr) # read_csv() comes from readr, not dplyr
recs2020 <- read_csv(file="recs2020_public_v1.csv")
# Flag households whose main space-heating fuel is natural gas (FUELHEAT == 1)
recs2020$NG_MAINSPACEHEAT <- ifelse(recs2020$FUELHEAT == 1, 1, 0)
# Keep the 60 jackknife replicate weight columns
repweights <- select(recs2020, NWEIGHT1:NWEIGHT60)
# Build the replicate-weight survey design
RECS <- svrepdesign(data = recs2020,
                    weight = ~NWEIGHT,
                    repweights = repweights,
                    type = "JK1",
                    combined.weights = TRUE,
                    scale = (ncol(repweights)-1)/ncol(repweights),
                    mse = TRUE)
# Estimate the weighted total of homes with natural gas main space heating
NG_MAINSPACEHEAT <- as.data.frame(svytotal(~NG_MAINSPACEHEAT, RECS))
Current output:
Error in svytotal(~NG_MAINSPACEHEAT, RECS) :
  could not find function "svytotal"

svytotal() lives in the survey package, so "could not find function" most likely means library(survey) was not attached in the session that ran the script (for example, because install.packages() failed on RStudio Cloud and the library() call errored). Attach it before building the design. The following stand-alone version builds the same design directly from EIA's SAS file:
library(haven)
library(survey)
sas_url <-
"https://www.eia.gov/consumption/residential/data/2020/sas/recs2020_public_v1.zip"
tf <- tempfile()
download.file( sas_url , tf , mode = 'wb' )
recs_tbl <- read_sas( tf )
recs_df <- data.frame( recs_tbl )
names( recs_df ) <- tolower( names( recs_df ) )
recs_design <-
  svrepdesign(
    data = recs_df ,
    weight = ~ nweight ,
    repweights = 'nweight[1-9]+' ,
    type = 'JK1' ,
    combined.weights = TRUE ,
    scale = 59 / 60 ,
    mse = TRUE
  )
svytotal( ~ as.numeric( fuelheat == 1 ) , recs_design )
# total SE
# as.numeric(fuelheat == 1) 56245389 545591
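Since the question also asks about RSEs: given the svytotal output above, the relative standard error is just 100 * SE / estimate. A minimal sketch using survey's accessors on the same design:
est <- svytotal( ~ as.numeric( fuelheat == 1 ) , recs_design )
# relative standard error (RSE, in percent), as in the RECS microdata guide
100 * SE( est ) / coef( est )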

Related

Difficulty in downloading TCGA data

I am trying to download the TCGA data but I am getting this error:
Error in summarizeMaf(maf = maf, anno = clinicalData, chatty =
verbose): Tumor_Sample_Barcode column not found in provided clinical
data. Rename column containing sample names to Tumor_Sample_Barcode if
necessary.
This is my code:
library("TCGAbiolinks")
library("tidyverse")
library(maftools)
query <- GDCquery( project = "TCGA-LIHC",
                   data.category = "Clinical",
                   file.type = "xml",
                   legacy = FALSE)
GDCdownload(query, directory = ".")
clinical <- GDCprepare_clinic(query, clinical.info = "patient", directory = ".")
#getting the survival time of event data
survival_data <- as_tibble(clinical[,c("days_to_last_followup","days_to_death","vital_status","bcr_patient_barcode","patient_id")])
survival_data <- filter(survival_data,!is.na(days_to_last_followup)|!is.na(days_to_death)) #not both NA
survival_data <- filter(survival_data,!is.na(days_to_last_followup)|days_to_last_followup>0 &is.na(days_to_death)|days_to_death > 0 ) #ensuring positive values
survival_data <- survival_data[!duplicated(survival_data$patient_id),] #ensuring no duplicates
dim(survival_data) #should be 371
maf <- GDCquery_Maf("LIHC", pipelines = "muse")
#maf <- GDCquery_Maf("LIHC", pipelines = "somaticsniper")
#clin <- GDCquery_clinic("TCGA-LIHC","clinical")
#print(clin )
laml = read.maf(
  maf,
  clinicalData = clinical,
  removeDuplicatedVariants = TRUE,
  useAll = TRUE,
  gisticAllLesionsFile = NULL,
  gisticAmpGenesFile = NULL,
  gisticDelGenesFile = NULL,
  gisticScoresFile = NULL,
  cnLevel = "all",
  cnTable = NULL,
  isTCGA = TRUE,
  vc_nonSyn = NULL,
  verbose = TRUE
)
You should have a) stated that you loaded maftools with library(maftools), and b) included what was printed out before that error message:
-Validating
-Silent variants: 18306
-Summarizing
--Possible FLAGS among top ten genes:
TTN
MUC16
OBSCN
FLG
-Processing clinical data
Available fields in provided annotations..
[1] "bcr_patient_barcode" "additional_studies"
[3] "tissue_source_site" "patient_id"
# snipped remaining 78 column names
Notice that the first column is not named "Tumor_Sample_Barcode", so you need to follow the helpful error message's directions and rename the appropriate column, which appears to be the first one:
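For example, with the clinical data frame from the question (the exact rename line is an assumption; any equivalent works):
colnames(clinical)[1] <- "Tumor_Sample_Barcode" # the first column is bcr_patient_barcode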
After doing so I get:
-Validating
-Silent variants: 18306
-Summarizing
--Possible FLAGS among top ten genes:
TTN
MUC16
OBSCN
FLG
-Processing clinical data
-Finished in 1.911s elapsed (2.470s cpu)

Error in colnames

Could anyone help me with a little problem? When I plot the frontier I get the following message (see below for detail):
Error in `colnames<-`(`*tmp*`, value = c("targetRisk", "targetReturn")) :
  attempt to set 'colnames' on an object with less than two dimensions
How could I solve this? Thanks a lot.
Portfolio construction & Optimisation
Assets: LUTAX, PFODX, BRGAX, GFAFX, NMSAX, EGINX, IPOYX, SCWFX, FGLDX, PAGEX
Getting monthly returns of the assets
library(quantmod)
library(tseries)
library(timeSeries)
LUTAX <- monthlyReturn((getSymbols("LUTAX",auto.assign=FALSE)[,4]),type = "arithmetic")
colnames(LUTAX) <- c("LUTAX")
PFODX <- monthlyReturn((getSymbols("PFODX",auto.assign=FALSE)[,4]),type = "arithmetic")
colnames(PFODX) <- c("PFODX")
BRGAX <- monthlyReturn((getSymbols("BRGAX",auto.assign=FALSE)[,4]),type = "arithmetic")
colnames(BRGAX) <- c("BRGAX")
GFAFX <- monthlyReturn((getSymbols("GFAFX",auto.assign=FALSE)[,4]),type = "arithmetic")
colnames(GFAFX) <- c("GFAFX")
NMSAX <- monthlyReturn((getSymbols("NMSAX",auto.assign=FALSE)[,4]),type = "arithmetic")
colnames(NMSAX) <- c("NMSAX")
EGINX <- monthlyReturn((getSymbols("EGINX",auto.assign=FALSE)[,4]),type = "arithmetic")
colnames(EGINX) <- c("EGINX")
IPOYX <- monthlyReturn((getSymbols("IPOYX",auto.assign=FALSE)[,4]),type = "arithmetic")
colnames(IPOYX) <- c("IPOYX")
SCWFX <- monthlyReturn((getSymbols("SCWFX",auto.assign=FALSE)[,4]),type = "arithmetic")
colnames(SCWFX) <- c("SCWFX")
FGLDX <- monthlyReturn((getSymbols("FGLDX",auto.assign=FALSE)[,4]),type = "arithmetic")
colnames(FGLDX) <- c("FGLDX")
PAGEX <- monthlyReturn((getSymbols("PAGEX",auto.assign=FALSE)[,4]),type = "arithmetic")
colnames(PAGEX) <- c("PAGEX")
Merging returns of the assets (excluding NA's)
portfolio_returns <- merge(LUTAX, PFODX,BRGAX,GFAFX,NMSAX,EGINX,IPOYX,SCWFX,FGLDX,PAGEX,all=F)
data <- as.timeSeries(portfolio_returns)
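As an aside, the ten near-identical download blocks above can be collapsed into a loop; a sketch, assuming the same tickers and quantmod/timeSeries loaded:
tickers <- c("LUTAX","PFODX","BRGAX","GFAFX","NMSAX","EGINX","IPOYX","SCWFX","FGLDX","PAGEX")
rets <- lapply(tickers, function(tk) {
  r <- monthlyReturn(getSymbols(tk, auto.assign = FALSE)[, 4], type = "arithmetic")
  colnames(r) <- tk
  r
})
portfolio_returns <- do.call(merge, c(rets, all = FALSE)) # merge, excluding NA's
data <- as.timeSeries(portfolio_returns)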
Optimisation portfolio
library(fPortfolio)
spec <- portfolioSpec()
setNFrontierPoints <- 25
setSolver(spec) <- "solveRquadprog"
constraints <- c("minW[1:1]=0.12","maxW[1:1]=0.18","minW[2:2]=0.12","maxW[2:2]=0.18",
"minW[3:3]=0.10","maxW[3:3]=0.15","minW[4:4]=0.08","maxW[4:4]=0.12",
"minW[5:5]=0.08","maxW[5:5]=0.12","minW[6:6]=0.05","maxW[6:6]=0.10",
"minW[7:7]=0.05","maxW[7:7]=0.10","minW[8:8]=0.08","maxW[8:8]=0.12",
"minW[9:9]=0.05","maxW[9:9]=0.10","minW[10:10]=0.08","maxW[10:10]=0.12",
"minsumW[c(1:1,2:2)]=0.27","maxsumW[c(1:1,2:2)]=0.33",
"minsumW[c(3:3,4:4,6:6,10:10)]=0.37","maxsumW[c(3:3,4:4,6:6,10:10)]=0.43",
"minsumW[c(5:5,7:7,8:8,9:9)]=0.27","maxsumW[c(5:5,7:7,8:8,9:9)]=0.33",
"maxsumW[c(1:1,2:2,3:3,4:4,5:5,6:6,7:7,8:8,9:9,10:10)]=1")
portfolioConstraints(data,spec,constraints)
frontier<- portfolioFrontier(data,spec,constraints)
print(frontier)
tailoredFrontierPlot(frontier)
After running the last command above I get the following message:
Error in `colnames<-`(`*tmp*`, value = c("targetRisk", "targetReturn")) :
  attempt to set 'colnames' on an object with less than two dimensions
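One detail worth checking (an observation, not a verified fix): the line setNFrontierPoints <- 25 creates a plain variable named setNFrontierPoints and leaves the spec untouched. fPortfolio's setters are replacement functions, so the intended call is presumably:
setNFrontierPoints(spec) <- 25
If the frontier ends up with too few valid points under these tight constraints, tailoredFrontierPlot() can receive a vector where it expects a two-column risk/return matrix, which would match the error above.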

Time series with MODISTools

I need to get a full EVI time series, along with dates and quality information. After executing MODISSubsets() the crude data is available, but not processed in as nice a way as MODISSummaries() would do. MODISSummaries(), however, reduces the time series to summary statistics, taking quality information into account.
Is there a way to extract time series for each tile from the crude data (see the data frame crude below)? It would be great if this could return a list of data frames, where each data frame represents one tile and holds data for EVI (or whatever variable), its date, and a quality flag.
Specifically, after doing the following ...
savedir <- './'
modis.subset <- data.frame(
  lat = 11.3175,
  long = 47.1167,
  end.date = "2016-09-29"
)
MODISSubsets(
  LoadDat = modis.subset,
  Products = "MOD13Q1",
  Bands = c("250m_16_days_EVI", "250m_16_days_pixel_reliability"),
  Size = c(1,1),
  StartDate = FALSE,
  SaveDir = savedir,
  TimeSeriesLength = 3
)
crude <- read.csv("./Lat47.11670Lon11.31750Start2013-01-01End2016-09-29___MOD13Q1.asc", header = FALSE, as.is = TRUE)
... how would you get to something like
nice <- list( lonX1_latY1=data.frame( date=..., var=..., qual=... ), lonX2_latX2=... )
...?
In short, I was missing that ExtractTile() is not usable with the return value of MODISTimeSeries(). My workaround applies ExtractTile() to the output of reading the ASCII file instead. Here is what I got working for my purpose. It returns a list containing: a matrix (npixels_lon, npixels_lat, n_timesteps) with all the downloaded MODIS data, in this case EVI; a matrix of identical dimensions containing the pixel reliability codes; and a vector of length n_timesteps holding the centre-pixel value where its quality flag is 0, or the mean of the surrounding pixels otherwise:
read_crude_modis <- function( filn, savedir, expand_x, expand_y ){
  # arguments:
  # filn: file name of ASCII file holding MODIS "crude" data
  # savedir: directory where to look for that file
  # expand_x : number of pixels to the right and left of centre
  # expand_y : number of pixels to the top and bottom of centre
  # MODIS pixel reliability flags:
  # -1 Fill/No Data   Not processed
  #  0 Good Data      Use with confidence
  #  1 Marginal data  Useful, but look at other QA information
  #  2 Snow/Ice       Target covered with snow/ice
  #  3 Cloudy         Target not visible, covered with cloud
  library( MODISTools )
  library( plyr )       # rename() with a named replacement vector is plyr's signature
  ScaleFactor <- 0.0001 # applied to output variable
  ndayyear <- 365
  ## Read downloaded ASCII file
  crude <- read.csv( paste( savedir, filn, sep="" ), header = FALSE, as.is = TRUE )
  crude <- rename( crude, c( "V1"="nrows", "V2"="ncols", "V3"="modislon_ll", "V4"="modislat_ll", "V5"="dxy_m", "V6"="id", "V7"="MODISprod", "V8"="yeardoy", "V9"="coord", "V10"="MODISprocessdatetime" ) )
  ## this is just read to get the length of the time series and the dates
  tseries <- MODISTimeSeries( savedir, Band = "250m_16_days_EVI" )
  ntsteps <- dim(tseries[[1]])[1]
  tmp <- rownames( tseries[[1]] )
  time <- data.frame( yr=as.numeric( substr( tmp, start=2, stop=5 )), doy=as.numeric( substr( tmp, start=6, stop=8 )) )
  time$dates <- as.POSIXlt( as.Date( paste( as.character(time$yr), "-01-01", sep="" ) ) + time$doy - 1 )
  time$yr_dec <- time$yr + ( time$doy - 1 ) / ndayyear
  ## get number of products for which data is in the ascii file (not used)
  nprod <- dim(crude)[1] / ntsteps
  if ( (dim(crude)[1]/nprod) != ntsteps ) { print("problem") }
  ## re-arrange data
  if ( dim(crude)[2]==11 && expand_x==0 && expand_y==0 ){
    ## only one pixel downloaded
    nice_all <- as.matrix( crude$V11[1:ntsteps], dim(1,1,ntsteps) ) * ScaleFactor ## EVI data
    nice_qual_flg <- as.matrix( crude$V11[(ntsteps+1):(2*ntsteps)], dim(1,1,ntsteps) ) ## pixel reliability data
  } else if ( dim(crude)[2]>11 ){
    ## multiple pixels downloaded
    # nice <- ExtractTile( Data = tseries, Rows = c(crude$nrows,expand_y), Cols = c(crude$ncols,expand_x), Grid = TRUE ) ## > is not working: applying ExtractTile to return of MODISTimeSeries
    nice_all <- ExtractTile( Data = crude[1:ntsteps,11:dim(crude)[2]] * ScaleFactor, Rows = c(crude$nrows[1],expand_y), Cols = c(crude$ncols[1],expand_x), Grid = TRUE )
    nice_qual_flg <- ExtractTile( Data = crude[(ntsteps+1):(2*ntsteps),11:dim(crude)[2]], Rows = c(crude$nrows[1],expand_y), Cols = c(crude$ncols[1],expand_x), Grid = TRUE )
  } else {
    print( "Not sufficient data downloaded. Adjust expand_x and expand_y." )
  }
  ## Clean data for centre pixel: where the quality flag is not '0', use the mean of all 8 surrounding pixels
  if ( expand_x==1 && expand_y==1 ){
    nice_centre <- nice_all[2,2,]
    nice_centre[ which( nice_qual_flg[2,2,]!=0 ) ] <- apply( nice_all[,,which( nice_qual_flg[2,2,]!=0 )], c(3), FUN=mean)
  }
  modis <- list( nice_all=nice_all, nice_centre=nice_centre, nice_qual_flg=nice_qual_flg, time=time )
  return( modis )
}
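For reference, a usage sketch matching the download above (the file name is the one read into crude earlier; expand_x = expand_y = 1 yields the 3 x 3 window that the centre-pixel cleaning step expects):
modis <- read_crude_modis(
  filn = "Lat47.11670Lon11.31750Start2013-01-01End2016-09-29___MOD13Q1.asc",
  savedir = "./",
  expand_x = 1, expand_y = 1
)
str( modis$time$dates ) # dates of each 16-day composite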

Getting error while creating R markdown PDF report

I am getting an error while creating a PDF report out of R markdown file. Below is the snippet of the error:
Error in --dayBikeData <- read.csv("D:\\Madhav\\Study\\MSIS\\PredictiveLearning\\Week-1\\Homework\\Bike-Sharing-Dataset\\day.csv") :
object 'dayBikeData' not found
Calls: <Anonymous> ... handle -> withCallingHandlers -> withVisible -> eval -> eval
Execution halted
I have this object, dayBikeData, in the session, but it still gives the error, and I don't know how to proceed.
Code for fetching the data from the csv file:
```{r}
dayBikeData <- read.csv("D:\\Madhav\\Study\\MSIS\\PredictiveLearning
\\Week-1\\Homework\\Bike-Sharing-Dataset\\day.csv")
# Performs each of the operations asked in the question
basicOperations <- function(inputData){
  lenData <- length(inputData)
  avg <- round(mean(inputData, na.rm = TRUE), digits = 2)   # mean calculation
  standardDeviation <- round(sd(inputData), digits = 2)     # standard deviation
  sem <- round(standardDeviation/sqrt(lenData), digits = 2) # standard error of the mean
  # CI is mean +/- error, where error is:
  error <- round(qnorm(0.975)*standardDeviation/sqrt(lenData), digits = 2)
  lower_ci <- avg - error
  upper_ci <- avg + error
  # resultList <- list(obs = lenData, mean = avg, standarDeviation = sd,
  #                    standardMeanError = sem, lowerCI = lower_ci, upperCI = upper_ci)
  resultList <- c(lenData, avg, standardDeviation, sem, lower_ci, upper_ci)
  print(resultList)
}
#Calculations for the Year Wise Data
# dData2011 <- dayBikeData[dayBikeData$yr==0,]
# dData2012 <- dayBikeData[dayBikeData$yr==1,]
dData2011ResultSet <- basicOperations(dayBikeData[dayBikeData$yr==0,]$cnt)
dData2012ResultSet <- basicOperations(dayBikeData[dayBikeData$yr==1,]$cnt)
#Calculations for the Holiday Wise Data
# dDataHoliady_0 <- dayBikeData[dayBikeData$holiday ==0,]
# dDataHoliady_1 <- dayBikeData[dayBikeData$holiday ==1,]
dDataHoliady0ResultSet <- basicOperations(dayBikeData[dayBikeData$holiday ==0,]$cnt)
dDataHoliady1ResultSet <- basicOperations(dayBikeData[dayBikeData$holiday ==1,]$cnt)
#Calculations for the WorkingDay Wise Data
# dDataWorkingDay_0 <- dayBikeData[dayBikeData$workingday ==0,]
# dDataWorkingDay_1 <- dayBikeData[dayBikeData$workingday ==1,]
dDataWorkingDay0ResultSet <- basicOperations(dayBikeData[dayBikeData$workingday ==0,]$cnt)
dDataWorkingDay1ResultSet <- basicOperations(dayBikeData[dayBikeData$workingday ==1,]$cnt)
#Calculations for the Temperature wise data
avgTemp <- mean(dayBikeData$temp, na.rm = TRUE)
dDataTempGreaterEq <- dayBikeData[dayBikeData$temp >= avgTemp,]
dDataTempLess <- dayBikeData[dayBikeData$temp < avgTemp,]
dDataTempGreaterEqResultSet <- basicOperations(dDataTempGreaterEq$cnt)
dDataTempLessResultSet <- basicOperations(dDataTempLess$cnt)
#Calculations for the Weather wise data
# dDataWeather_1 <- dayBikeData[dayBikeData$weathersit ==1,]
# dDataWeather_2 <- dayBikeData[dayBikeData$weathersit ==2,]
# dDataWeather_3 <- dayBikeData[dayBikeData$weathersit ==3,]
dDataWeather1ResultSet <- basicOperations(dayBikeData[dayBikeData$weathersit ==1,]$cnt)
dDataWeather2ResultSet <- basicOperations(dayBikeData[dayBikeData$weathersit ==2,]$cnt)
dDataWeather3ResultSet <- basicOperations(dayBikeData[dayBikeData$weathersit ==3,]$cnt)
#Calculations for the Season wise data
# dDataSeason_1 <- dayBikeData[dayBikeData$season ==1,]
# dDataSeason_2 <- dayBikeData[dayBikeData$season ==2,]
# dDataSeason_3 <- dayBikeData[dayBikeData$season ==3,]
# dDataSeason_4 <- dayBikeData[dayBikeData$season ==4,]
dDataSeason1ResultSet <- basicOperations(dayBikeData[dayBikeData$season ==1,]$cnt)
dDataSeason2ResultSet <- basicOperations(dayBikeData[dayBikeData$season ==2,]$cnt)
dDataSeason3ResultSet <- basicOperations(dayBikeData[dayBikeData$season ==3,]$cnt)
dDataSeason4ResultSet <- basicOperations(dayBikeData[dayBikeData$season ==4,]$cnt)
#Construct a row-wise table of the results
resultData <- rbind(dData2011ResultSet, dData2012ResultSet, dDataHoliady0ResultSet,
dDataHoliady1ResultSet,dDataWorkingDay0ResultSet,
dDataWorkingDay1ResultSet,dDataTempGreaterEqResultSet,
dDataTempLessResultSet, dDataWeather1ResultSet,
dDataWeather2ResultSet, dDataWeather3ResultSet,dDataSeason1ResultSet,
dDataSeason2ResultSet, dDataSeason3ResultSet,dDataSeason4ResultSet)
colnames(resultData) <- c("N","Mean","SD" , "SEM","Lower_CI", "UPPER_CI")
rownames(resultData) <- c("Year-0", "Year-1", "Holiday-0", "Holiday-1", "WorkingDay-0",
"WorkingDay-1","Temperature >=","Temperature <", "Weather-1",
"Weather-2","Weather-3","Season-1","Season-2", "Season-3",
"Season-4")
df.resultData <- as.data.frame(resultData)
df.resultData["Value"] <- NA
df.resultData$Value <- c(2011, 2012, 0,1, 0,1,1, 0, 1,2,3,1,2,3,4)
df.resultData = df.resultData[,c(7,1,2,3,4,5,6)]
library(knitr)
# print(xtable(df.resultData), type = "latex")
kable(df.resultData, format = "markdown")
write.csv(df.resultData, file = "D:\\X\\Study\\MSIS\\PredictiveLearning\\OutputResult.csv")
```
Your file path is wrong: there is a newline and lots of spaces in the middle of it.
> "D:\\Madhav\\Study\\MSIS\\PredictiveLearning
+ \\Week-1\\Homework\\Bike-Sharing-Dataset\\day.csv"
[1] "D:\\Madhav\\Study\\MSIS\\PredictiveLearning\n \\Week-1\\Homework\\Bike-Sharing-Dataset\\day.csv"
So the file is not getting read properly and hence the object is not available in the knitr session.
I downloaded your dataset from UCI Machine Learning Repository, saved your markdown in a new folder, adjusted the filenames by deleting the paths, ran it, and it worked fine.
So maybe your session is corrupt, or the paths are wrong, or something similar. Try what I did and it should work.
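A defensive version of the read (paths as in the question, shown only to illustrate keeping the string on one line or assembling it with file.path()):
csv_path <- file.path("D:", "Madhav", "Study", "MSIS", "PredictiveLearning",
                      "Week-1", "Homework", "Bike-Sharing-Dataset", "day.csv")
stopifnot(file.exists(csv_path)) # fail fast, with a clear error, if the path is wrong
dayBikeData <- read.csv(csv_path)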

How to input HDFS file into R mapreduce for processing and get the result into HDFS file

I have a question similar to the one in the link below on Stack Overflow:
R+Hadoop: How to read CSV file from HDFS and execute mapreduce?
I am trying to read a file from the location "/somnath/logreg_data/ds1.10.csv" in HDFS, reduce its number of columns from 10 to 5, and then write it to another location, "/somnath/logreg_data/reduced/ds1.10.reduced.csv", in HDFS using the transfer.csvfile.hdfs.to.hdfs.reduced function below.
transfer.csvfile.hdfs.to.hdfs.reduced("hdfs://10.5.5.82:8020/somnath/logreg_data/ds1.10.csv", "hdfs://10.5.5.82:8020/somnath/logreg_data/reduced/ds1.10.reduced.csv", 5)
The function definition is
transfer.csvfile.hdfs.to.hdfs.reduced =
  function(hdfsFilePath, hdfsWritePath, reducedCols=1) {
    #local.df = data.frame()
    #hdfs.get(hdfsFilePath, local.df)
    #to.dfs(local.df)
    #r.file <- hdfs.file(hdfsFilePath,"r")
    transfer.reduced.map =
      function(., M) {
        label <- M[, dim(M)[2]]
        reduced.predictors <- M[, 1:reducedCols]
        reduced.M <- cbind(reduced.predictors, label)
        keyval(1, as.numeric(reduced.M))
      }
    reduced.values =
      values(
        from.dfs(
          mapreduce(
            input = from.dfs(hdfsFilePath),
            input.format = "native",
            map = function(., M) {
              label <- M[, dim(M)[2]]
              print(label)
              reduced.predictors <- M[, 1:reducedCols]
              reduced.M <- cbind(reduced.predictors, label)
              keyval(1, as.numeric(reduced.M))
            }
          )))
    write.table(reduced.values, file="/root/somnath/reduced.values.csv")
    w.file <- hdfs.file(hdfsWritePath, "w")
    hdfs.write(reduced.values, w.file)
    #to.dfs(reduced.values)
  }
But I am receiving an error
Error in file(fname, paste(if (is.read) "r" else "w", if (format$mode == :
cannot open the connection
Calls: transfer.csvfile.hdfs.to.hdfs.reduced ... make.keyval.reader -> do.call -> <Anonymous> -> file
In addition: Warning message:
In file(fname, paste(if (is.read) "r" else "w", if (format$mode == :
cannot open file 'hdfs://10.5.5.82:8020/somnath/logreg_data/ds1.10.csv': No such file or directory
Execution halted
Alternatively, when I try to load a file from HDFS using the command below, I get this error:
> x <- hdfs.file(path="hdfs://10.5.5.82:8020/somnath/logreg_data/ds1.10.csv",mode="r")
Error in hdfs.file(path = "hdfs://10.5.5.82:8020/somnath/logreg_data/ds1.10.csv", :
attempt to apply non-function
Any help will be highly appreciated. Thanks.
Basically, I found a solution to the problem that I stated above:
r.file <- hdfs.file(hdfsFilePath, "r")
from.dfs(
  mapreduce(
    input = as.matrix(hdfs.read.text.file(r.file)),
    input.format = "csv",
    map = ...
  ))
Below is the entire modified function:
transfer.csvfile.hdfs.to.hdfs.reduced =
  function(hdfsFilePath, hdfsWritePath, reducedCols=1) {
    hdfs.init()
    #local.df = data.frame()
    #hdfs.get(hdfsFilePath, local.df)
    #to.dfs(local.df)
    r.file <- hdfs.file(hdfsFilePath, "r")
    transfer.reduced.map =
      function(., M) {
        numRows <- length(M)
        M.vec.elems <- unlist(lapply(M, function(x) strsplit(x, ",")))
        M.matrix <- matrix(M.vec.elems, nrow=numRows, byrow=TRUE)
        label <- M.matrix[, dim(M.matrix)[2]]
        reduced.predictors <- M.matrix[, 1:reducedCols]
        reduced.M <- cbind(reduced.predictors, label)
        keyval(1, as.numeric(reduced.M))
      }
    reduced.values =
      values(
        from.dfs(
          mapreduce(
            input = as.matrix(hdfs.read.text.file(r.file)),
            input.format = "csv",
            map = function(., M) {
              numRows <- length(M)
              M.vec.elems <- unlist(lapply(M, function(x) strsplit(x, ",")))
              M.matrix <- matrix(M.vec.elems, nrow=numRows, byrow=TRUE)
              label <- M.matrix[, dim(M.matrix)[2]]
              reduced.predictors <- M.matrix[, 1:reducedCols]
              reduced.M <- cbind(reduced.predictors, label)
              keyval(1, as.numeric(reduced.M))
            }
          )))
    write.table(reduced.values, file="/root/somnath/reduced.values.csv")
    w.file <- hdfs.file(hdfsWritePath, "w")
    hdfs.write(reduced.values, w.file)
    hdfs.close(r.file)
    hdfs.close(w.file)
    #to.dfs(reduced.values)
  }
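With this version, the original call from the top of the question should work unchanged, assuming the rhdfs and rmr2 packages are attached and the HDFS paths exist:
library(rhdfs) # hdfs.init(), hdfs.file(), hdfs.read.text.file(), hdfs.write()
library(rmr2)  # mapreduce(), from.dfs(), values(), keyval()
transfer.csvfile.hdfs.to.hdfs.reduced(
  "hdfs://10.5.5.82:8020/somnath/logreg_data/ds1.10.csv",
  "hdfs://10.5.5.82:8020/somnath/logreg_data/reduced/ds1.10.reduced.csv",
  5
)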
Hope this helps, and don't forget to give points if you find it useful. Thanks in advance.
