Changing the values in txt file using R r - r

I want to write code that changes the values of selected parameters in the input txt file of my hydrological model. Some parameters are fixed and some I will change to observed values. For example, HYDRUS_Version=4 is fixed, while WaterFlow=1, SoluteTransport=0, etc. are parameters whose values I want to change. I want to assign values such as WaterFlow=5 or WaterFlow=3.1 and SoluteTransport=2 or SoluteTransport=2.2.
I tried this code for water Flow but the values in my txt files are not changed. I am just new learner of R.
# Read the model input file into a character vector, one element per line.
lines <- readLines("G:/Rlearning/HYDRUS1D.txt")
library(gsubfn)
# Flag the lines that contain the text "Vertical Conductivity".
# NOTE(review): none of the lines of the txt file shown below contain that
# text, so for that file i1 is all FALSE and the replacement changes nothing.
i1 <- grepl("Vertical Conductivity", lines)
# On the flagged lines, replace each run of digits/dots by twice its numeric
# value, formatted without scientific notation.
lines[i1] <- gsubfn("[0-9.]+", ~format(as.numeric(x)*2,
scientific = FALSE), lines[i1])
# NOTE(review): the modified vector is not written back here (there is no
# writeLines() call), so this snippet alone never changes the file on disk.
Below is the complete txt file.
;
[Main]
HYDRUS_Version=4
WaterFlow=1
SoluteTransport=0
Unsatchem=0
Unsatchem=0
HP1=0
HeatTransport=0
EquilibriumAdsorption=1
MobileImmobile=0
RootWaterUptake=1
RootGrowth=0
MaterialNumbers=1
SubregionNumbers=1
SpaceUnit=cm
TimeUnit=days
PrintTimes=160
NumberOfSolutes=0
InitialCondition=1
;
[Profile]
NumberOfNodes=101
ProfileDepth=1.2E+02
ObservationNodes=5
GridVisible=1
SnapToGrid=1
ProfileWidth=80
LeftMargin=40
GridOrgX=0
GridOrgY=0
GridDX=5.E+00
GridDY=5.E+00

Create a custom function to search matching string and replace value:
# Replace the value of "key=value" lines whose text matches `phrase`.
#
# phrase: regular expression (as used by grep()) selecting the lines to change.
# value:  new value; it is pasted after the first "=" of every selected line.
# file:   character vector holding the lines of the input file.
#
# Returns `file` with the selected lines rewritten as "<key>=<value>". Lines
# that match `phrase` but contain no "=" (e.g. a "[Main]" section header) are
# left untouched, and only the first "=" of a line is treated as the separator.
replaceFn <- function(phrase, value, file) {
  hits <- grep(phrase, file)
  # Position of the first "=" in each matching line (-1 when there is none).
  eq.pos <- regexpr("=", file[hits], fixed = TRUE)
  has.eq <- eq.pos > 0L
  hits <- hits[has.eq]
  eq.pos <- eq.pos[has.eq]
  if (length(hits) == 0L) {
    # Nothing to replace: return the input unchanged.
    return(file)
  }
  # Keep everything up to and including the "=", then append the new value.
  file[hits] <- paste0(substring(file[hits], 1L, eq.pos), value)
  file
}
dat <- replaceFn("WaterFlow", 3.1, dat)
dat <- replaceFn("SoluteTransport", 2.2, dat)
[1] ";" "[Main]" "HYDRUS_Version=4" "WaterFlow=3.1" "SoluteTransport=2.2" "Unsatchem=0"
[7] "Unsatchem=0" "HP1=0" "HeatTransport=0" "EquilibriumAdsorption=1" "MobileImmobile=0" "RootWaterUptake=1"
[13] "RootGrowth=0" "MaterialNumbers=1" "SubregionNumbers=1" "SpaceUnit=cm" "TimeUnit=days" "PrintTimes=160"
[19] "NumberOfSolutes=0" "InitialCondition=1" ";" "[Profile]" "NumberOfNodes=101" "ProfileDepth=1.2E+02"
[25] "ObservationNodes=5" "GridVisible=1" "SnapToGrid=1" "ProfileWidth=80" "LeftMargin=40" "GridOrgX=0"
[31] "GridOrgY=0" "GridDX=5.E+00" "GridDY=5.E+00"
Input data:
dat <- c(";", "[Main]", "HYDRUS_Version=4", "WaterFlow=1", "SoluteTransport=0",
"Unsatchem=0", "Unsatchem=0", "HP1=0", "HeatTransport=0", "EquilibriumAdsorption=1",
"MobileImmobile=0", "RootWaterUptake=1", "RootGrowth=0", "MaterialNumbers=1",
"SubregionNumbers=1", "SpaceUnit=cm", "TimeUnit=days", "PrintTimes=160",
"NumberOfSolutes=0", "InitialCondition=1", ";", "[Profile]",
"NumberOfNodes=101", "ProfileDepth=1.2E+02", "ObservationNodes=5",
"GridVisible=1", "SnapToGrid=1", "ProfileWidth=80", "LeftMargin=40",
"GridOrgX=0", "GridOrgY=0", "GridDX=5.E+00", "GridDY=5.E+00")

Related

Extraction of data from HDF at different pressure level

I am trying to extract a variable named Data Fields/OzoneTropColumn at a point location (lon=40, lat=34) at different pressure level (825.40198, 681.29102, 464.16000, 316.22699 hPa) from multiple hdf files
library(raster)
library(ncdf4)
library(RNetCDF)
# read file
nc <- nc_open("E:/Ozone/test1.nc")
list_col1 <- as.list(list.files("E:/Ozone/", pattern = "*.hdf",
full.names = TRUE))
> attributes(nc$var) #using a single hdf file to check its variables
$names
[1] "Data Fields/Latitude" "Data Fields/Longitude"
[3] "Data Fields/O3" "Data Fields/O3DataCount"
[5] "Data Fields/O3Maximum" "Data Fields/O3Minimum"
[7] "Data Fields/O3StdDeviation" "Data Fields/OzoneTropColumn"
[9] "Data Fields/Pressure" "Data Fields/TotColDensDataCount"
[11] "Data Fields/TotColDensMaximum" "Data Fields/TotColDensMinimum"
[13] "Data Fields/TotColDensStdDeviation" "Data Fields/TotalColumnDensity"
[15] "HDFEOS INFORMATION/StructMetadata.0" "HDFEOS INFORMATION/coremetadata"
> pres <- ncvar_get(nc, "Data Fields/Pressure") #looking at pressure level from single file of hdf
> pres
[1] 825.40198 681.29102 464.16000 316.22699 215.44400 146.77901 100.00000 68.12950 46.41580 31.62290
[11] 21.54430 14.67800 10.00000 6.81291 4.64160
ncin <- raster::stack(list_col1,
varname = "Data Fields/OzoneTropColumn",
ncdf=TRUE)
#cannot extract using the following code
o3 <- ncvar_get(list_col1,attributes(list_col1$var)$names[9])
"Error in ncvar_get(list_col1, attributes(list_col1$var)$names[9]) :
first argument (nc) is not of class ncdf4!"
#tried to extract pressure levels
> prsr <- raster::stack(list_col1,varname = "Data Fields/Pressure",ncdf=TRUE)
"Error in h(simpleError(msg, call)) :
error in evaluating the argument 'x' in selecting a method for function 'stack': varname: Data Fields/Pressure does not exist in the file. Select one from:
Data Fields/O3, Data Fields/O3DataCount, Data Fields/O3Maximum, Data Fields/O3Minimum, Data Fields/O3StdDeviation, Data Fields/OzoneTropColumn, Data Fields/TotColDensDataCount, Data Fields/TotColDensMaximum, Data Fields/TotColDensMinimum, Data Fields/TotColDensStdDeviation, Data Fields/TotalColumnDensity"
#tried using index
#Point location can also be written as below 1 deg by 1 deg resolution
lonIdx <- which(lon >32 & lon <36)
latIdx <- which(lat >38 & lat <42)
presIdx <- which(pres >= 400 & pres <= 900)
#also tried
# Option 2 -- subset using array indexing
o3 <- ncvar_get(list_col1,'Data Fields/OzoneTropColumn')
"Error in ncvar_get(list_col1, "Data Fields/OzoneTropColumn") :
first argument (nc) is not of class ncdf4!"
extract2 <- o3[lonIdx, latIdx, presIdx, ]
How do I extract these values vertically at each pressure level? (SM = some value)
I would like the output in following way at location (lon=40, lat=34):
Pressure 1 2 3 4 5 .... 10
825.40198 SM1 SM2 SM3 SM4 SM5... SM10
681.29102 SM11 SM12
464.16000
316.22699 SM.. SM.. SM.. SM.. SM.. SM..
Appreciate any help.
Thank you
This might be an issue with how netcdf4 and raster name each of the layers in the file. And perhaps some confusion with trying to create a multilayer object from multiple ncdf at once.
I would do the following, using only raster: Load a single netCDF, using stack() or brick(). This will load the file as a multilayer object in R. Use names() to identify what is the name of the Ozone layer according to the raster package.
firstraster <- stack("E:/Ozone/test1.nc")
names(firstraster)
Once you find out the name, you can just execute a reading of all objects as stack(), extract the information on points of interest, without even assembling all layers in a single stack.
# Name of the ozone layer, as reported by names() on a loaded stack.
Ozonelayername <- "put name here"
# list.files() expects a regular expression, not a glob: match names ending in ".hdf".
files <- list.files("E:/Ozone/", pattern = "\\.hdf$", full.names = TRUE)
# One multilayer stack per file.
stacklist <- lapply(files, stack)
# Pull the ozone layer out of every stack. The layer name is passed as the
# variable (unquoted); quoting it would look for a layer literally called
# "Ozonelayername".
Ozonelayerlist <- lapply(stacklist, "[[", Ozonelayername)
The line above will output a list of rasters objects (not stacks or bricks, just plain rasters), with only the layer you want.
Now we just need to execute an extract on each of these layers. sapply() will format that neatly in a matrix for us.
pointsofinterest <- expand.grid(32:36,38:42)
values <- sapply(Ozonelayerlist, extract, pointsofinterest)
I can't test it, since I do not have the data, but I assume this would work.

Automate "ncvar_get"-reading of different variable names in NetCDF files

I have a NetCDF dataset with two climate scenarios (rcp & hist), both of them containing 25 files. Each file either contains data for the variable "pr", "tas", "tasmax", or "tasmin". I wrote a for loop to iteratively read the files of hist and rcp, read them with nc_open, extract the variable with ncvar_get and finally make a calculation in form of mean(abs(hist - rcp) to obtain the mean absolute distance between each pair of hist and rcp. The problem: as ncvar_get requires the exact variable name of the current file I wrote an if else block (see below) that shall find the variable name of the current file and apply it for ncvar_get. Running the code I obtain the following error:
[1] "vobjtovarid4: error #F: I could not find the requsted var (or dimvar) in the file!"
[1] "var (or dimvar) name: tas"
[1] "file name: /data/historical/tasmax_ICHEC-EC-EARTH_DMI-HIRHAM5_r3i1p1.nc" Error in vobjtovarid4(nc, varid, verbose = verbose, allowdimvar = TRUE) : Variable not found
#Extract of the files in the hist list. Same file names in the rcp list, but different directory
> hist.files.cl <- list.files("/historical", full.names = TRUE)
> hist.files.cl
[1] "/historical/pr_CNRM-CERFACS-CNRM-CM5_ALADIN53_r1i1p1.nc"
[2] "/historical/pr_CNRM-CERFACS-CNRM-CM5_ALARO-0_r1i1p1.nc"
[3] "/historical/pr_ICHEC-EC-EARTH_HIRHAM5_r3i1p1.nc"
[4] "/historical/pr_ICHEC-EC-EARTH_RACMO22E_r12i1p1.nc"
[5] "/historical/pr_ICHEC-EC-EARTH_RCA4_r12i1p1.nc"
[6] "/historical/pr_MPI-M-MPI-ESM-LR_RCA4_r1i1p1.nc"
[7] "/historical/pr_MPI-M-MPI-ESM-LR_REMO2009_r1i1p1.nc"
[8] "/historical/pr_MPI-M-MPI-ESM-LR_REMO2009_r2i1p1.nc"
[9] "/historical/tas_CNRM-CERFACS-CNRM-CM5_CNRM-ALADIN53_r1i1p1.nc"
[10] "/historical/tas_CNRM-CERFACS-CNRM-CM5_RMIB-UGent-ALARO-0_r1i1p1.nc"
[11] "/historical/tas_ICHEC-EC-EARTH_DMI-HIRHAM5_r3i1p1.nc"
[12] "/historical/tas_ICHEC-EC-EARTH_KNMI-RACMO22E_r12i1p1.nc"
[13] "/historical/tas_ICHEC-EC-EARTH_SMHI-RCA4_r12i1p1.nc"
[14] "/historical/tas_MPI-M-MPI-ESM-LR_MPI-CSC-REMO2009_r1i1p1.nc"
[15] "/historical/tas_MPI-M-MPI-ESM-LR_MPI-CSC-REMO2009_r2i1p1.nc"
[16] "/historical/tasmax_ICHEC-EC-EARTH_DMI-HIRHAM5_r3i1p1.nc"
[17] "/historical/tasmax_ICHEC-EC-EARTH_KNMI-RACMO22E_r12i1p1.nc"
[18] "/historical/tasmax_ICHEC-EC-EARTH_SMHI-RCA4_r12i1p1.nc"
# One distance matrix per hist/rcp file pair is collected in this list.
euc.distance <- list()
for(i in 1:length(hist.files.cl)) {
#Open ith file in list of hist files as well as in list of rcp files
hist.data <- nc_open(hist.files.cl[i])
rcp.data <- nc_open(rcp.files.cl[i])
# Choose the variable to read from the file name.
# NOTE(review): grepl() matches substrings anywhere in the full path. "tas" is
# a substring of "tasmax" and "tasmin", so the "tas" branch below is taken for
# those files too and the "tasmax" and else branches are never reached for
# them. "pr" would likewise match a directory name containing "pr".
if(grepl("pr", hist.data$filename)){
hist.var <- ncvar_get(hist.data, "pr")
rcp.var <- ncvar_get(rcp.data, "pr")
}else if (grepl("tas", hist.data$filename)){
hist.var <- ncvar_get(hist.data, "tas")
rcp.var <- ncvar_get(rcp.data, "tas")
}else if (grepl("tasmax", hist.data$filename)){
hist.var <- ncvar_get(hist.data, "tasmax")
rcp.var <- ncvar_get(rcp.data, "tasmax")
}else{
hist.var <- ncvar_get(hist.data, "tasmin")
rcp.var <- ncvar_get(rcp.data, "tasmin")
}
#Converting temperature variable from K to °C:
if(grepl("tas", hist.data$filename)){
hist.var <- hist.var-273.15
rcp.var <- rcp.var-273.15
}
#Find for the ith rcp file with dim=(1,1,360) in the ith hist file with dim=(385,373,360) the grid point with the best fitting distribution (each grid point consists of a distribution of 360 time steps).The calculation may contain errors...
# For every cell of the first two dimensions, the mean absolute difference
# between rcp.var and that cell's series.
euc.distance[[i]] <- apply(hist.var, c(1,2), function(x) mean(abs(rcp.var - x)))
# Positions whose rank (ties get the minimum rank) is at most 10.
# NOTE(review): min_values is overwritten on every iteration and not stored,
# and the opened files are not closed inside this loop.
min_values <- which(rank(euc.distance[[i]], ties.method='min') <= 10)
}
As cath highlighted the probable cause of the error, but the proposed approach to extract the part of interest (=variable name) from the filename does not work. I before tried to automate the extraction of the variable name by using stringr("filename",startposition, endposition) until I noticed that there is no sense in it, because each variable name (pr, tas, tasmax, tasmin) has another string length. What possibilities do you see for me?
Thank you a lot!
To complete a bit my comment, if you need to operate on each file, you could do it at once, putting everything in a list.
So, first get the "keypart" for each file:
keyparts <- sub("^([a-z]+)_.+", "\\1", basename(hist.files.cl))
keyparts
# [1] "pr" "pr" "pr" "pr" "pr" "pr" "pr" "pr"
# [9] "tas" "tas" "tas" "tas" "tas" "tas" "tas" "tasmax"
#[17] "tasmax" "tasmax"
Then you can use lapply to do what you need to do for every files at once:
# For every hist/rcp file pair: read the variable named by keyparts[i], convert
# temperature variables from K to °C, compute for each grid cell the mean
# absolute difference to the rcp series, and find the positions of the 10
# smallest distances.
# Returns a list with one element per file; each element is a list with
# `euc.distance` (the distance matrix) and `min.values` (the positions).
my_res <- lapply(seq_along(keyparts),
                 function(i) {
  hist.data <- nc_open(hist.files.cl[i])
  # Close the file handles when this file pair is done, even on error.
  on.exit(nc_close(hist.data), add = TRUE)
  rcp.data <- nc_open(rcp.files.cl[i])
  on.exit(nc_close(rcp.data), add = TRUE)

  hist.var <- ncvar_get(hist.data, keyparts[i])
  rcp.var <- ncvar_get(rcp.data, keyparts[i])

  # tas, tasmax and tasmin are all temperatures and need the K -> °C shift.
  if (startsWith(keyparts[i], "tas")) {
    hist.var <- hist.var - 273.15
    rcp.var <- rcp.var - 273.15
  }

  euc.distance <- apply(hist.var, c(1, 2), function(x) mean(abs(rcp.var - x)))
  # euc.distance is a matrix local to this file, so rank it as a whole.
  min_values <- which(rank(euc.distance, ties.method = "min") <= 10)
  list(euc.distance = euc.distance, min.values = min_values)
})

How to loop combination of variables from xml-file and save in new xml-file in R

I currently have a large xml-file with a lot of variables which I want to change and export in a new xml file with the help of R.
Here is the beginning of my xml file (it stays quite the same until the end)
<Assemblies count="4">
<Assembly index="0">
<IdentNr>2</IdentNr>
<IDNr_DB>0</IDNr_DB>
<Name>Decke D1</Name>
<Order_Layers choice="von außen nach innen">2</Order_Layers>
<Grid_Kind choice="Mittel">2</Grid_Kind>
<Layers count="5">
<Layer index="0">
<Thickness unit="m">0.003</Thickness>
<Material>
<IDNr_DB>1203</IDNr_DB>
<Name>Linoleum nach DIN 18171</Name>
<ThermalConductivity unit="W/mK">0.17</ThermalConductivity>
<BulkDensity unit="kg/m³">1000</BulkDensity>
<Porosity unit="-">0.23</Porosity>
<HeatCapacity unit="J/kgK">1500</HeatCapacity>
<WaterVaporResistance unit="-">6250</WaterVaporResistance>
<ReferenceWaterContent unit="kg/m³" />
<FreeWaterSaturation unit="kg/m³" />
<WaterAbsorptionCoefficient unit="kg/m²s^0.5" />
<MoistureSupplement unit="%/M.-%" />
<TempDepThermalCondSupplement unit="W/mK²" />
<TypicalMoisture unit="kg/m³" />
<Layer index="1">
<Thickness unit="m">0.02</Thickness>
<Material>
<IDNr_DB>1039</IDNr_DB>
<Name>Zement Fließestrich, obere Schicht</Name>
<ThermalConductivity unit="W/mK">1.6</ThermalConductivity>
<BulkDensity unit="kg/m³">1890</BulkDensity>
<Porosity unit="-">0.2</Porosity>
<HeatCapacity unit="J/kgK">850</HeatCapacity>
<WaterVaporResistance unit="-">58</WaterVaporResistance>
<ReferenceWaterContent unit="kg/m³">37.8</ReferenceWaterContent>
<FreeWaterSaturation unit="kg/m³">168</FreeWaterSaturation>
<WaterAbsorptionCoefficient unit="kg/m²s^0.5">0.025</WaterAbsorptionCoefficient>
<MoistureSupplement unit="%/M.-%" />
<TempDepThermalCondSupplement unit="W/mK²">0.0002</TempDepThermalCondSupplement>
<TypicalMoisture unit="kg/m³">168</TypicalMoi
I've already got a code with a foreach loop, but I need to change a ton of variables. Here is the example with 2 loops which exports 6 new files
library(XML)
doc <- xmlTreeParse("d:\\Users\\Documents\\raum-klima-putz\\R\\TestXML\\test.xml", getDTD = F)
r <- xmlRoot(doc)
ExpPath <- "d:\\Users\\Documents\\raum-klima-putz\\R\\TestXML3"
example.weatherfile <- c("d:\\Users\\Documents\\raum-klima-putz\\R\\WetterdatenJuni2017\\2032_Karlsruhe.epw", "d:\\Users\\Documents\\raum-klima-putz\\R\\WetterdatenJuni2017\\2032_Karlsruhe_swdirnorm.epw", "Test")
example.thickness <- c("12","20","21")
# First loop: for each index i, set the weather file name and the thickness of
# the first layer of the first assembly, then save the tree as "Weather_neu<i>.xml".
# NOTE(review): both values are taken at the same index i, so they change
# together; this does not produce every weatherfile/thickness combination.
for (i in 1:length(example.weatherfile))
{
xmlValue(r[["Variants"]][[1]][["ClimateLocation"]][["FileName"]]) <- example.weatherfile[i]
xmlValue(r[["Assemblies"]][[1]][["Layers"]][[1]][["Thickness"]]) <- example.thickness[i]
FileName <- paste("Weather_neu",i, ".xml", sep="");
saveXML(r, file=paste(ExpPath, FileName, sep = "\\"), compression=0, prefix = NULL);
}
# Second loop: same assignments as above; only the output file name prefix
# ("Thickness_neu") differs.
for (i in 1:length(example.thickness))
{
xmlValue(r[["Variants"]][[1]][["ClimateLocation"]][["FileName"]]) <- example.weatherfile[i]
xmlValue(r[["Assemblies"]][[1]][["Layers"]][[1]][["Thickness"]]) <- example.thickness[i]
FileName <- paste("Thickness_neu",i, ".xml", sep="");
saveXML(r, file=paste(ExpPath, FileName, sep = "\\"), compression=0, prefix = NULL);
}
But instead of creating a for loop for each combination of variables, I want to create a sort of matrix so I can get a combination of all the parameters, e.g. in the case of 10 parameters: a matrix of 10x10 which gives 100 new files. So the first parameter gets replaced by 3 values and combined with all the other parameters, then the second parameter and so on.
Is there an easier way to create these variations than to write a foreach loop for every parameter?
Thank you
This might work to get you what you are after.
library(XML)
doc <- xmlTreeParse("d:\\Users\\Documents\\raum-klima-putz\\R\\TestXML\\test.xml", getDTD = F)
r <- xmlRoot(doc)
ExpPath <- "d:\\Users\\Documents\\raum-klima-putz\\R\\TestXML3"
example.weatherfile <- c("d:\\Users\\Documents\\raum-klima-putz\\R\\WetterdatenJuni2017\\2032_Karlsruhe.epw", "d:\\Users\\Documents\\raum-klima-putz\\R\\WetterdatenJuni2017\\2032_Karlsruhe_swdirnorm.epw", "Test")
example.thickness <- c("12","20","21")
# Create dataframe with all combinations of weatherfiles and thicknesses
all.combos <- expand.grid(example.weatherfile, example.thickness)
# Add a column for row number
all.combos$rownum <- seq_len(nrow(all.combos))
# Write one XML file per row of all.combos.
# The rows are indexed directly instead of using apply(): apply() first turns
# the data frame into a character matrix, which pads the numeric rownum column
# to a common width, so with 10 or more rows the file names would contain
# spaces (e.g. "Weather_neu 1.xml").
for (k in seq_len(nrow(all.combos))) {
  # Work on a copy so the parsed tree `r` itself is left unmodified.
  variant <- r
  # Var1 is the weatherfile, Var2 the thickness (expand.grid's default names).
  xmlValue(variant[["Variants"]][[1]][["ClimateLocation"]][["FileName"]]) <- as.character(all.combos$Var1[k])
  xmlValue(variant[["Assemblies"]][[1]][["Layers"]][[1]][["Thickness"]]) <- as.character(all.combos$Var2[k])
  FileName <- paste0("Weather_neu", all.combos$rownum[k], ".xml")
  saveXML(variant, file = paste(ExpPath, FileName, sep = "\\"), compression = 0, prefix = NULL)
}

Read the VW raw scores from (CS)OAA

VowpalWabbit writes raw predictions from (CS)OAA model as a sequence of lines like this:
1:-2.31425 2:-3.98557 3:-3.97967 4:-2.63708 5:-3.18749 6:-2.43984 7:-4.99018 8:-3.49138 9:-3.07816 10:-6.15126 11:-6.01152 12:-5.76039 13:-5.13096 14:-5.18472 15:-5.37358 16:-5.24147 17:-5.21512 18:-5.67961 19:-4.62929 20:-4.61404 000db8cd6aef4e5fa459126d36e0fa1f-none
1:-2.65864 2:-3.33924 3:-2.8116 4:-1.83108 5:-2.05677 6:-1.29879 7:-6.7446 8:-3.05036 9:-2.82138 10:-5.19605 11:-4.5119 12:-5.28309 13:-4.35789 14:-4.76992 15:-4.16866 16:-4.6897 17:-3.76224 18:-4.13129 19:-4.4489 20:-4.32605 000e0e58a4cb4a218bbc6cae0b1af201-none
How do I read it into R?
Here is my code:
## Read a file of raw VW (CS)OAA predictions into a data frame.
## Each line holds space-separated "class:score" tokens followed by an id token.
## The result has a character column "id" followed by one numeric column per
## class (V1, V2, ...); class labels must be 1, 2, ... in order on every line.
read.vw.oaa.scores <- function (myfile) {
  tokens <- strsplit(readLines(myfile), ' ', fixed = TRUE)
  parsed <- sapply(tokens, function (tok) {
    ## everything except the last token is a "class:score" pair
    pairs <- strsplit(tok[-length(tok)], ':', fixed = TRUE)
    m <- matrix(unlist(pairs), ncol = 2, byrow = TRUE)
    ## class labels are expected to be consecutive, starting at 1
    stopifnot(identical(1:nrow(m),as.integer(m[,1])))
    ## id first, then the scores (still as text)
    c(tok[length(tok)], m[, 2])
  })
  scores <- as.data.frame(t(parsed), stringsAsFactors = FALSE)
  ## shift the default V* names one column right to make room for "id"
  names(scores) <- c("id", head(names(scores), -1))
  score.cols <- tail(names(scores), -1)
  scores[score.cols] <- lapply(scores[score.cols], as.numeric)
  scores
}
Are there any obvious bugs/inefficiencies?
Is there a better way?
PS. This data format looks like CRS but it is not it.
See if the following works for you (probably really slow). Assumes all desired values are in numeric:value format. And uses raw which requires each line to be stored as a character array.
raw = c("1:-2.31425 2:-3.98557 3:-3.97967 4:-2.63708 5:-3.18749 6:-2.43984 7:-4.99018 8:-3.49138 9:-3.07816 10:-6.15126 11:-6.01152 12:-5.76039 13:-5.13096 14:-5.18472 15:-5.37358 16:-5.24147 17:-5.21512 18:-5.67961 19:-4.62929 20:-4.61404 000db8cd6aef4e5fa459126d36e0fa1f-none",
"1:-2.65864 2:-3.33924 3:-2.8116 4:-1.83108 5:-2.05677 6:-1.29879 7:-6.7446 8:-3.05036 9:-2.82138 10:-5.19605 11:-4.5119 12:-5.28309 13:-4.35789 14:-4.76992 15:-4.16866 16:-4.6897 17:-3.76224 18:-4.13129 19:-4.4489 20:-4.32605 000e0e58a4cb4a218bbc6cae0b1af201-none")
Function to clean
# Extract the first n scores from one raw prediction line.
# t: a single line of space-separated "label:score" tokens.
# n: number of leading tokens to keep.
# Returns the n scores as a numeric vector, with the "label:" prefixes removed.
clean <- function(t, n) {
  tokens <- unlist(strsplit(t, split = " "))
  leading <- tokens[1:n]
  stripped <- gsub("^[0-9]+:", "", leading)
  as.numeric(stripped)
}
lapply(raw, clean, n = 20)
[[1]]
[1] -2.31425 -3.98557 -3.97967 -2.63708 -3.18749 -2.43984 -4.99018 -3.49138 -3.07816 -6.15126 -6.01152 -5.76039
[13] -5.13096 -5.18472 -5.37358 -5.24147 -5.21512 -5.67961 -4.62929 -4.61404
[[2]]
[1] -2.65864 -3.33924 -2.81160 -1.83108 -2.05677 -1.29879 -6.74460 -3.05036 -2.82138 -5.19605 -4.51190 -5.28309
[13] -4.35789 -4.76992 -4.16866 -4.68970 -3.76224 -4.13129 -4.44890 -4.32605

R loop for creating and using -csv

I have a function output (from koRpus) of the form:
Total number of tokens: 887
Total number of types: 393
Measure of Textual Lexical Diversity
MTLD: 142.66
Number of factors: 6.22
Factor size: 0.72
SD tokens/factor: 41.55 (all factors)
38 (complete factors only)
And I want to make a loop for storing these results for 80 different documents. I have tried the following:
# For every table id: join that id's answers into one text, drop English stop
# words, write the text to a CSV file, tokenize that file and compute MTLD.
for (i in 1:length(infra$tableid)) {
# All whypar entries whose first column equals i, joined into a single string.
whypar <- paste(infra$whypar [infra[,1] ==i], collapse=" ")
wpi<- removeWords(whypar, stopwords("english"))
# NOTE(review): the result of this call is not assigned, so it has no effect.
as.data.frame(wpi)
write.csv(data.frame(wpi), file= "wp.csv")
tagged.text <- tokenize("wp.csv", lang="en")
res.mtld <- MTLD(tagged.text)
# NOTE(review): "output.csv" is overwritten on every iteration, so only the
# result of the last document is left in the file after the loop.
write.csv(data.frame(res.mtld),file="output.csv")
}
where infra is:
tableid 1, 2, 3, ... 80
whypar "I took part because xxx", "I believe that jshfdjk", "jhsadkjhd" ... (N=350)
Thanks for any help
Extract the parts of the MTLD object you are interested in first. From your question it seems like you are only interested in a subset of the object returned by MTLD, namely the MTLD score, number of factors the SD of tokens/factor and the SD for complete factors only. If you only want these results for each file you can just write one nice table as your output for all the files:
# Empty result table; it fixes the column names and types of the output.
res <- data.frame( ID = numeric() , MTLD=numeric() , Factor_Size=numeric() , SD=numeric() , SD_Complete=numeric() )
# Collect one single-row data frame per document and bind them once at the
# end: rbind()-ing a plain vector onto a zero-row data frame inside the loop
# would lose the column names and re-copy the table on every iteration.
n.docs <- length(infra$tableid)
rows <- vector("list", n.docs)
for (i in seq_len(n.docs)) {
  whypar <- paste(infra$whypar[infra[, 1] == i], collapse = " ")
  wpi <- removeWords(whypar, stopwords("english"))
  wpi <- as.data.frame(wpi)
  write.csv(data.frame(wpi), file = "wp.csv")
  tagged.text <- tokenize("wp.csv", lang = "en")
  res.mtld <- MTLD(tagged.text)
  # Slots of the returned S4 object are accessed with `@`.
  # NOTE(review): the slot and element names below are taken from the original
  # answer; verify them against the koRpus version in use.
  mtld <- res.mtld@MTLD$MTLD
  fac.size <- res.mtld@param$factor.size
  mtld.sd <- res.mtld@MTLD$lengths$sd
  mtld.sd.compl <- res.mtld@MTLD$lengths$sd.compl
  rows[[i]] <- data.frame(
    ID = infra$tableid[i],
    MTLD = mtld,
    Factor_Size = fac.size,
    SD = mtld.sd,
    SD_Complete = mtld.sd.compl
  )
}
res <- do.call(rbind, c(list(res), rows))
write.csv( res , file="output.csv" )
I hope this helps, but check these are the results you want returned.

Resources