After inserting an apply instead of loop - r

I converted my dataset to a data.table and I'm now using sapply (from the apply family), but so far that hasn't been sufficient. Is this fully correct?
I already went from this:
library(data.table)
library(lubridate)
# For every breakout timestamp, sum the "Buyer" quantity traded in the window
# of length time_to_collect_volume that ends at that timestamp.
breakout_times <- df_1_30sec_5min$date_time
# Preallocate the result so the list is not regrown on every iteration.
buying_volume_before_breakout <- vector("list", length(breakout_times))
# seq_along() gives an empty sequence when there are no breakouts, whereas
# 1:length(x) would iterate over c(1, 0).
for (e in seq_along(breakout_times)) {
  interval <- dolar_tick_data_unified_dt[
    date_time <= breakout_times[e] &
      date_time >= breakout_times[e] - time_to_collect_volume &
      Type == "Buyer"
  ]
  buying_volume_before_breakout[[e]] <- sum(interval$Quantity)
}
To this (I created a function and am now calling it with sapply):
# Total "Buyer" quantity traded in the window of length
# time_to_collect_volume that ends at the e-th breakout timestamp.
#   e: row index into df_1_30sec_5min.
# Returns the sum of Quantity over the matching rows (0 when none match).
# Reads dolar_tick_data_unified_dt, df_1_30sec_5min and
# time_to_collect_volume from the enclosing environment.
fun_buying_volume_before_breakout <- function(e) {
  window_end <- df_1_30sec_5min$date_time[e]
  window_start <- window_end - time_to_collect_volume
  interval <- dolar_tick_data_unified_dt[
    date_time <= window_end & date_time >= window_start & Type == "Buyer"
  ]
  sum(interval$Quantity)
}
# seq_along() gives an empty sequence when there are no breakouts, whereas
# 1:length(x) would call the function with indices 1 and 0.
buying_volume_before_breakout <- sapply(
  seq_along(df_1_30sec_5min$date_time),
  fun_buying_volume_before_breakout
)
I couldn't make my data reproducible but here are some more insights about its structure.
> str(dolar_tick_data_unified_dt)
Classes ‘data.table’ and 'data.frame': 3120650 obs. of 6 variables:
$ date_time : POSIXct, format: "2017-06-02 09:00:35" "2017-06-02 09:00:35" "2017-06-02 09:00:35" ...
$ Buyer_from : Factor w/ 74 levels "- - ","- - BGC LIQUIDEZ DTVM",..: 29 44 19 44 44 44 44 17 17 17 ...
$ Price : num 3271 3271 3272 3271 3271 ...
$ Quantity : num 5 5 5 5 5 5 10 5 50 25 ...
$ Seller_from: Factor w/ 73 levels "- - ","- - BGC LIQUIDEZ DTVM",..: 34 34 42 28 28 28 28 34 45 28 ...
$ Type : Factor w/ 4 levels "Buyer","Direct",..: 1 3 1 1 1 1 1 3 3 3 ...
- attr(*, ".internal.selfref")=<externalptr>
> str(df_1_30sec_5min)
Classes ‘data.table’ and 'data.frame': 3001 obs. of 13 variables:
$ date_time : POSIXct, format: "2017-06-02 09:33:30" "2017-06-02 09:49:38" "2017-06-02 10:00:41" ...
$ Price : num 3251 3252 3256 3256 3260 ...
$ fast_small_mm : num 3250 3253 3254 3256 3259 ...
$ slow_small_mm : num 3254 3253 3254 3256 3259 ...
$ fast_big_mm : num 3255 3256 3256 3256 3258 ...
$ slow_big_mm : num 3258 3259 3260 3261 3262 ...
$ breakout_strength : num 6.5 2 0.5 2 2.5 0.5 1 2.5 1 0.5 ...
$ buying_volume_before_breakout: num 1285 485 680 985 820 ...
$ total_volume_before_breakout : num 1285 485 680 985 820 ...
$ average_buying_volume : num 1158 338 318 394 273 ...
$ average_total_volume : num 1158 338 318 394 273 ...
$ relative_strenght : num 1 1 1 1 1 1 1 1 1 1 ...
$ relative_strenght_last_6min : num 1 1 1 1 1 1 1 1 1 1 ...
- attr(*, ".internal.selfref")=<externalptr>

First, separate the 'buyer' data from the rest. Then add a column for the start of each time interval and do a non-equi join in data.table, which is what @chinsoon is suggesting. I've made a reproducible example below:
library(data.table)
# Fixed seed so the simulated data below are reproducible.
set.seed(123)
N <- 1e5
# Simulated buyer trades (stands in for tick data already filtered to buyers):
# N timestamps drawn uniformly within +/- 1e6 seconds of the current time,
# each with a whole-number quantity.
buyer_dt <- data.table(
tm = Sys.time()+runif(N,-1e6,+1e6),
quantity=round(runif(N,1,20))
)
# Timestamps at which the volume is wanted: one every 15 minutes (15*60
# seconds) across the span of the simulated trades.
time_dt <- data.table(
t = seq(
min(buyer_dt$tm),
max(buyer_dt$tm),
by = 15*60
)
)
# Length of the look-back window, in seconds.
t_int <- 300
# t1 is the start of each window, so [t1, t] is the range matched in the join.
time_dt[,t1:=t-t_int]
library(rbenchmark)
# Compare the row-by-row sapply approach with a single non-equi join.
benchmark(
  a = { # Your sapply code
    # seq_len() is safe when time_dt has zero rows; 1:nrow() is not.
    bv1 <- sapply(seq_len(nrow(time_dt)), function(i) {
      buyer_dt[between(tm, time_dt$t[i] - t_int, time_dt$t[i]), sum(quantity)]
    })
  },
  b = { # data.table non-equi join
    # One row per (window, trade) pair with t1 <= tm <= t.
    all_intervals <- buyer_dt[
      time_dt,
      .(t, quantity),
      on = .(tm >= t1, tm <= t)
    ]
    # A window that matches no trade comes back with quantity = NA;
    # na.rm = TRUE turns its total into 0, in line with the sapply result.
    bv2 <- all_intervals[, sum(quantity, na.rm = TRUE), by = .(t)]
  },
  replications = 9
)
#> test replications elapsed relative user.self sys.self user.child
#> 1 a 9 42.75 158.333 81.284 0.276 0
#> 2 b 9 0.27 1.000 0.475 0.000 0
#> sys.child
#> 1 0
#> 2 0
Edit: In general, any join of two tables A and B is a subset of the cross join [A x B] (their Cartesian product). The rows of [A x B] are all possible combinations of a row of A with a row of B. An equi join subsets [A x B] by checking equality conditions: if x and y are the join columns in A and B, your join will be the rows from [A x B] where A.x = B.x and A.y = B.y.
In a NON-equi join, the subset condition uses comparison operators OTHER than =. Your case is an example: you want the rows where A.x <= B.x <= A.x + delta.
I don't know much about how they are implemented, but data.table has a pretty fast one that has worked well for me with large data frames.

Related

R: Directly "Feed" Results into an IF STATEMENT

I am working with the R programming language. I have the following example - there are two data frames (height_quantiles and test):
> height_quantiles
salary_type quant_80
1 A 3.752192
2 B 3.713571
3 C 4.117180
> str(height_quantiles)
'data.frame': 3 obs. of 2 variables:
$ salary_type: Factor w/ 3 levels "A","B","C": 1 2 3
$ quant_80 : Named num 3.75 3.71 4.12
..- attr(*, "names")= chr [1:3] "80%" "80%" "80%"
and
> head(test)
salary height salary_type
701 1.358904 1.6148796 A
702 -2.702212 1.0604070 A
703 1.534527 -4.0957218 A
704 5.594247 5.7373110 B
705 -1.823547 5.5808484 A
706 7.949913 -0.2021635 C
str(test)
'data.frame': 300 obs. of 3 variables:
$ salary : num 1.36 -2.7 1.53 5.59 -1.82 ...
$ height : num 1.61 1.06 -4.1 5.74 5.58 ...
$ salary_type: Factor w/ 3 levels "A","B","C": 1 1 1 2 1 3 2 1 2 3 ...
I am trying to write the following code :
test$height_pred = as.numeric(ifelse(test$salary_type == "A", height_quantiles[1,1], ifelse(test$salary_type == "B", height_quantiles[2,1], height_quantiles[3,1])))
But this returns the values of test$height_pred as 1, 2, 3. I would like it to return the corresponding values from the height_quantiles data frame instead, such as 3.75, 3.71, 4.12.
Can someone please show me how to do this?
Thanks
You need to extract data from the second column i.e height_quantiles[1,2], height_quantiles[2,2] etc. Right now, you are doing it from the first column.
Also a better approach would be to use a join or match.
# For each row of test, find its salary_type in height_quantiles and copy
# over the quant_80 value from that row.
test$height_pred <- with(
  height_quantiles,
  quant_80[match(test$salary_type, salary_type)]
)
Or
merge(test, height_quantiles)

Calculation of Geometric Mean of Data that includes NAs

EDIT: The problem was not within the geoMean function, but with a wrong use of aggregate(), as explained in the comments
I am trying to calculate the geometric mean of multiple measurements for several different species, which includes NAs. An example of my data looks like this:
species <- c("Ae", "Ae", "Ae", "Be", "Be")
phen <- c(2, NA, 3, 1, 2)
hveg <- c(NA, 15, 12, 60, 59)
df <- data.frame(species, phen, hveg)
When I try to calculate the geometric mean for the species Ae with the built-in function geoMean from the package EnvStats like this
library("EnvStats")
aggregate(df[, 3:3], list(df1$Sp), geoMean, na.rm=TRUE)
it works wonderfully and skips the NAs, giving me the geometric means per species.
Group.1 phen hveg
1 Ae 4.238536 50.555696
2 Be 1.414214 1.414214
When I do this with my large dataset, however, the function stumbles over NAs and returns NA as result even though there are e.g 10 numerical values and only one NA. This happens for example with the column SLA_mm2/mg.
My large data set looks like this:
> str(cut2trait1)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 22 obs. of 19 variables:
$ Cut : chr "15_08" "15_08" "15_08" "15_08" ...
$ Block : num 1 1 1 1 1 1 1 1 1 1 ...
$ ID : num 451 512 431 531 591 432 551 393 511 452 ...
$ Plot : chr "1_1" "1_1" "1_1" "1_1" ...
$ Grazing : chr "n" "n" "n" "n" ...
$ Acro : chr "Leuc.vulg" "Dact.glom" "Cirs.arve" "Trif.prat" ...
$ Sp : chr "Lv" "Dg" "Ca" "Tp" ...
$ Label_neu : chr "Lv021" "Dg022" "Ca021" "Tp021" ...
$ PlantFunctionalType: chr "forb" "grass" "forb" "forb" ...
$ PlotClimate : chr "AC" "AC" "AC" "AC" ...
$ Season : chr "Aug" "Aug" "Aug" "Aug" ...
$ Year : num 2015 2015 2015 2015 2015 ...
$ Tiller : num 6 3 3 5 6 8 5 2 1 7 ...
$ Hveg : num 25 38 70 36 68 65 23 58 71 27 ...
$ Hrep : num 39 54 77 38 76 70 65 88 98 38 ...
$ Phen : num 8 8 7 8 8 7 6.5 8 8 8 ...
$ SPAD : num 40.7 42.4 48.7 43 31.3 ...
$ TDW_in_g : num 4.62 4.85 11.86 5.82 8.99 ...
$ SLA_mm2/mg : num 19.6 19.8 20.3 21.2 21.7 ...
and the result of my code
gm_cut2trait1 <- aggregate(cut2trait1[, 13:19], list(cut2trait1$Sp), geoMean, na.rm=TRUE)
is (only the first two rows):
Group.1 Tiller Hveg Hrep Phen SPAD TDW_in_g SLA_mm2/mg
1 Ae 13.521721 73.43485 106.67933 NA 28.17698 1.2602475 NA
2 Be 8.944272 43.95452 72.31182 5.477226 20.08880 0.7266361 9.309672
Here, the geometric mean of SLA for Ae is NA, even though there are 9 numeric measurements and only one NA in the column used to calculate the geometric mean.
I tried to use the geometric mean function suggested here:
Geometric Mean: is there a built-in?
But instead of NAs, this returned the value 1.000 when used with my big dataset, which doesn't solve my problem.
So my question is: What is the difference between my example df and the big dataset that throws the geoMean function off the rails?

R Standardizing numeric variables in dataframe while retaining factor variables

I have a dataframe (dcc) loaded in R which I have narrowed down to complete cases.
str(dcc)
'data.frame': 41715 obs. of 9 variables:
$ XCoord : num 661382 661412 661442 661472 661502 ...
$ YCoord : num 648092 648092 648092 648092 648092 ...
$ OBJECTID : int 1 2 3 4 5 6 7 8 9 10 ...
$ POINTID : int 1 2 3 4 5 6 7 8 9 10 ...
$ GRID_CODE : int 0 0 0 0 0 0 0 0 0 0 ...
$ APPL_COST_DIST_RIV_COAST: num 21350 21674 22185 22748 23448 ...
$ APPL_DEM30 : int 785 793 792 769 765 777 784 789 781 751 ...
$ APPL_DEM30_SLOPE : num 19.7 13.3 18.6 23.2 21 ...
$ APPL_SITE_NONSITE : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
I want to standardize the numeric and integer variables by subtracting the mean and dividing by the standard deviation. When I apply the following code, I inadvertently drop the factor variable APPL_SITE_NONSITE from the dataframe:
ind <- sapply(dcc, is.numeric)
dcc.s<-sapply(dcc[,ind], function(x) (x-mean(x))/sd(x))
dcc.s<-data.frame(dcc.s)
If I'm not mistaken, that happens because ind=FALSE for that variable. It seems like I need some combination of a for loop and if/else statement to standardize the numeric variables and leave the factor variable alone. I have tried a number of permutations, but keep getting errors. For example, the following code:
dcc.s <- for (i in 1:ncol(dcc)){ sapply(dcc[,i],
if (is.numeric(dcc[,i])==TRUE) {
function(x) (x-mean(x))/sd(x) }
else {dcc[,i]})
}
returns the error:
Error in match.fun(FUN) :
c("'if (is.numeric(dcc[, i]) == TRUE) {' is not a function, character or symbol", "' function(x) (x - mean(x))/sd(x)' is not a function, character or symbol", "'} else {' is not a function, character or symbol", "' dcc[, i]' is not a function, character or symbol", "'}' is not a function, character or symbol")
Perhaps this is a simple formatting error or a misplaced bracket, but I'm thoroughly stuck. I am open to other approaches if there is a more elegant way to do this. Any help would be much appreciated.
You need to use rapply instead of sapply
set.seed(1)
> df=data.frame(A=rnorm(10),b=1:10,C=as.factor(rep(1:2,5)))
> str(df)
'data.frame': 10 obs. of 3 variables:
$ A: num -0.626 0.184 -0.836 1.595 0.33 ...
$ b: int 1 2 3 4 5 6 7 8 9 10
$ C: Factor w/ 2 levels "1","2": 1 2 1 2 1 2 1 2 1 2
The code you need to use:
> D=rapply(df,scale,c("numeric","integer"),how="replace")
> D
A b C
1 -0.97190653 -1.4863011 1
2 0.06589991 -1.1560120 2
3 -1.23987805 -0.8257228 1
4 1.87433300 -0.4954337 2
5 0.25276523 -0.1651446 1
6 -1.22045645 0.1651446 2
7 0.45507643 0.4954337 1
8 0.77649606 0.8257228 2
9 0.56826358 1.1560120 1
10 -0.56059319 1.4863011 2
> str(D)
'data.frame': 10 obs. of 3 variables:
$ A: num [1:10, 1] -0.9719 0.0659 -1.2399 1.8743 0.2528 ...
..- attr(*, "scaled:center")= num 0.132
..- attr(*, "scaled:scale")= num 0.781
$ b: num [1:10, 1] -1.486 -1.156 -0.826 -0.495 -0.165 ...
..- attr(*, "scaled:center")= num 5.5
..- attr(*, "scaled:scale")= num 3.03
$ C: Factor w/ 2 levels "1","2": 1 2 1 2 1 2 1 2 1 2
>
Here is a dplyr and scale solution.
For dplyr < 1.0.0
# Attach dplyr with library() so a missing package raises an error right
# away; require() would only return FALSE and the failure would surface
# later at %>%.
library(dplyr)
# Standardize every numeric column; non-numeric columns are left untouched.
df %>% mutate_if(is.numeric, scale)
# a runif(20) rnorm(20)
#1 y 0.5783877 -0.004177104
#2 n -0.2344854 -0.866626472
#3 m 1.5629961 1.526857969
#4 h 0.9648646 -1.557975547
#5 u -0.7212756 0.533400304
#6 u 1.4753675 -0.072289864
#7 b 0.5346870 -0.464299111
#8 l -0.4287559 0.426600473
#9 m -1.2050841 -0.880135405
#10 h -0.6150410 -0.040636433
#11 r 1.3768249 -0.719785950
#12 a -1.3929511 0.083010969
#13 a -0.4422665 0.385574213
#14 l -0.7719473 -0.934716525
#15 m 1.4483803 0.131974911
#16 k 0.6291919 2.598581195
#17 k -1.0356817 -1.018890381
#18 s -1.0960083 1.560216350
#19 y -0.8826702 -0.367821579
#20 v 0.2554671 -0.318862011
For dplyr >= 1.0.0
# across(where(is.numeric), scale) applies scale() to the numeric columns only.
df %>% mutate(across(where(is.numeric), scale))
Note that scale(x) will do the same as (x - mean(x)) / sd(x); if you want to scale based on different metrics (e.g. a robust/modified Z score based on the median and MAD) you can do that using sweep.
Sample data
# Reproducible sample data: one factor column (a) plus two unnamed numeric
# columns, which cbind.data.frame() labels with their deparsed calls.
set.seed(2017)
df <- cbind.data.frame(
  a = factor(sample(letters, 20, replace = TRUE)),
  runif(20),
  rnorm(20)
)
# Flag the numeric (double or integer) columns of dcc.
ind <- vapply(dcc, is.numeric, logical(1))
# Start from a full copy so the non-numeric columns (e.g. the factor) are
# kept and the column order is unchanged, then overwrite only the numeric
# columns with their z-scores. cbind(dcc, <scaled columns>) would instead
# append duplicate-named columns next to the unscaled originals.
dcc.s <- dcc
dcc.s[ind] <- lapply(dcc[ind], function(x) (x - mean(x)) / sd(x))
If you don't need the "old" dataframe you can also do
# Flag the numeric (double or integer) columns of dcc.
ind <- vapply(dcc, is.numeric, logical(1))
# lapply() returns one standardized vector per selected column, and `[<-`
# writes them back in place. (vapply() cannot be used here without a
# FUN.VALUE template; calling it with only two arguments is an error.)
dcc[ind] <- lapply(dcc[ind], function(x) (x - mean(x)) / sd(x))

R: Extracting lines from dataframes in list and splitting into new dataframes

I have a list with 3 data frames (DvE, DvS, EvS) in it:
str(Table.list2)
List of 3
$ DvE:'data.frame': 18482 obs. of 4 variables:
..$ gene : Factor w/ 18482 levels "c10000_g1_i3|m.32237",..: 1 2 3 4 5 6 7 8 9 10 ...
..$ FDR : num [1:18482] 0.502 0.982 0.936 0.411 0.461 ...
..$ log2FC : num [1:18482] 0.415 -0.245 0.728 -0.384 0.474 ...
..$ annotation: Factor w/ 4939 levels "","[Genbank](myosin heavy-chain) kinase [Calothrix sp. PCC 6303] ",..: 1 2204 2980 2204 1 2204 4622 2980 1 241 ...
$ DvS:'data.frame': 18482 obs. of 4 variables:
..$ gene : Factor w/ 18482 levels "c10000_g1_i3|m.32237",..: 1 2 3 4 5 6 7 8 9 10 ...
..$ FDR : num [1:18482] 1.25e-01 7.18e-01 2.02e-01 2.72e-13 6.02e-01 ...
..$ log2FC : num [1:18482] -0.417 0.583 2.148 1.689 -0.167 ...
..$ annotation: Factor w/ 4939 levels "","[Genbank](myosin heavy-chain) kinase [Calothrix sp. PCC 6303] ",..: 1 2204 2980 2204 1 2204 4622 2980 1 241 ...
$ EvS:'data.frame': 18482 obs. of 4 variables:
..$ gene : Factor w/ 18482 levels "c10000_g1_i3|m.32237",..: 1 2 3 4 5 6 7 8 9 10 ...
..$ FDR : num [1:18482] 1.78e-03 6.04e-01 4.09e-01 3.42e-19 3.20e-02 ...
..$ log2FC : num [1:18482] -0.832 0.828 1.42 2.073 -0.641 ...
..$ annotation: Factor w/ 4939 levels "","[Genbank](myosin heavy-chain) kinase [Calothrix sp. PCC 6303] ",..: 1 2204 2980 2204 1 2204 4622 2980 1 241 ...
all 3 dataframes have similar structure, e.g.:
> head(Table.list2$DvE)
gene FDR log2FC annotation
1 c10000_g1_i3|m.32237 0.5024600 0.4149066
2 c10000_g1_i4|m.32240 0.9818297 -0.2449509 [Pfam]Calcium-activated chloride channel
3 c10000_g1_i4|m.32242 0.9361868 0.7277203 [Pfam]LSM domain
4 c10000_g1_i5|m.32244 0.4114795 -0.3835745 [Pfam]Calcium-activated chloride channel
5 c10000_g1_i6|m.32245 0.4605157 0.4739777
6 c10000_g1_i6|m.32246 0.4965353 -0.4607749 [Pfam]Calcium-activated chloride channel
What I'd like to do is in each data frame, take out data that has FDR < 0.05 and log2FC > 0 and put in a new data frame, and then take out data that has FDR < 0.05 and log2FC < 0 and put in another data frame.
So that from a list of 3 data frames, I'd get 6 new data frames that are named:
DvE.+
DvE.-
DvS.+
DvS.-
EvS.+
EvS.-
Example output of DvE.+:
gene FDR log2FC annotation
47 c10010_g1_i4|m.32346 8.609296e-15 1.9188013 [Genbank]conserved unknown protein [Ectocarpus siliculosus]
48 c10010_g1_i4|m.32348 5.625766e-09 1.8240089 [Genbank]hypothetical protein THAOC_07134 [Thalassiosira oceanica]
155 c10037_g1_i4|m.32582 2.666894e-02 0.6669399 [Pfam]LETM1-like protein
211 c10050_g2_i2|m.32706 8.154555e-03 1.6900611 [Genbank]hypothetical protein SELMODRAFT_84252 [Selaginella moellendorffii]
243 c10057_g1_i1|m.32812 1.936893e-02 0.8141790 [Pfam]Fibrinogen alpha/beta chain family
265 c10061_g4_i2|m.32861 3.614401e-02 1.7059034 [Pfam]Maf1 regulator
I was wondering if there's a more elegant way/loop that I can do all this in rather than repeatedly writing out similar command lines?
Update:
I tried doing this:
DEG.list <- lapply(Table.list2, function(i){
pos <- i[(i$FDR < 0.05 & i$log2FC > 0),]
neg <- i[(i$FDR < 0.05 & i$log2FC < 0),]
assign(paste(i, ".+", sep=""), value=pos)
assign(paste(i, ".-", sep=""), value=neg)
})
But I got this error:
Warning messages:
1: In assign(paste(i, ".+", sep = ""), value = pos) :
only the first element is used as variable name
2: In assign(paste(i, ".-", sep = ""), value = neg) :
only the first element is used as variable name
3: In assign(paste(i, ".+", sep = ""), value = pos) :
only the first element is used as variable name
4: In assign(paste(i, ".-", sep = ""), value = neg) :
only the first element is used as variable name
5: In assign(paste(i, ".+", sep = ""), value = pos) :
only the first element is used as variable name
6: In assign(paste(i, ".-", sep = ""), value = neg) :
only the first element is used as variable name
Not tested:
library(dplyr) # filter()
# Table.list2 is already a named list (DvE, DvS, EvS); iterating over it
# directly lets lapply() carry those names over to the result.
listdf <- Table.list2
# For each comparison, split the significant genes (FDR < 0.05) into
# up-regulated ("+", log2FC > 0) and down-regulated ("-", log2FC < 0).
alldf <- lapply(listdf, function(i) {
  list(
    "+" = filter(i, FDR < 0.05 & log2FC > 0),
    "-" = filter(i, FDR < 0.05 & log2FC < 0)
  )
})
# Flatten one level only: the outer and inner names are joined with ".",
# giving six data frames named DvE.+, DvE.-, DvS.+, DvS.-, EvS.+, EvS.-.
DEG.list <- unlist(alldf, recursive = FALSE)

Build a proper dataframe from a matrix list after importing .xlsx file

Implemented:
I am importing a .xlsx file into R.
This file consists of three sheets.
I am binding all the sheets into a list.
Need to Implement
Now I want to combine this list of matrices into a single data.frame, with names(dataset) as the header.
I tried using the as.data.frame with read.xlsx as given in the help but it did not work.
I explicitly tried with as.data.frame(as.table(dataset)) but still it generates a long list of data.frame but nothing that I want.
I want to have a structure like
header = names and the values below that, just like how the read.table imports the data.
This is the code I am using:
xlfile <- list.files(pattern = "*.xlsx")
wb <- loadWorkbook(xlfile)
sheet_ct <- wb$getNumberOfSheets()
b <- rbind(list(lapply(1:sheet_ct, function(x) {
res <- read.xlsx(xlfile, x, as.data.frame = TRUE, header = TRUE)
})))
b <- b [-c(1),] # Just want to remove the second header
I want to have the data arrangement something like below.
Ei Mi hours Nphy Cphy CHLphy Nhet Chet Ndet Cdet DON DOC DIN DIC AT dCCHO TEPC Ncocco Ccocco CHLcocco PICcocco par Temp Sal co2atm u10 dicfl co2ppm co2mol pH
1 1 1 1 0.1023488 0.6534707 0.1053458 0.04994161 0.3308593 0.04991916 0.3307085 0.05042275 49.76304 14.99330000 2050.132 2150.007 0.9642220 0.1339044 0.1040715 0.6500288 0.1087667 0.1000664 0.0000000 9.900000 31.31000 370 0.01 -2.963256000 565.1855 0.02562326 7.879427
2 1 1 2 0.1045240 0.6448216 0.1103250 0.04988347 0.3304699 0.04984045 0.3301691 0.05085697 49.52745 14.98729000 2050.264 2150.007 0.9308690 0.1652179 0.1076058 0.6386706 0.1164099 0.1001396 0.0000000 9.900000 31.31000 370 0.01 -2.971632000 565.7373 0.02564828 7.879042
3 1 1 3 0.1064772 0.6369597 0.1148174 0.04982555 0.3300819 0.04976363 0.3296314 0.05130091 49.29323 14.98221000 2050.396 2150.007 0.8997098 0.1941872 0.1104229 0.6291149 0.1225822 0.1007908 0.8695131 9.900000 31.31000 370 0.01 -2.980446000 566.3179 0.02567460 7.878636
4 1 1 4 0.1081702 0.6299084 0.1187672 0.04976784 0.3296952 0.04968840 0.3290949 0.05175249 49.06034 14.97810000 2050.524 2150.007 0.8705440 0.2210289 0.1125141 0.6213265 0.1273103 0.1018360 1.5513170 9.900000 31.31000 370 0.01 -2.989259000 566.8983 0.02570091 7.878231
5 1 1 5 0.1095905 0.6239005 0.1221460 0.04971029 0.3293089 0.04961446 0.3285598 0.05220978 48.82878 14.97485000 2050.641 2150.007 0.8431960 0.2459341 0.1140222 0.6152447 0.1308843 0.1034179 2.7777070 9.900000
Please don't suggest that I put all the data on a single sheet or convert the .xlsx file to .csv or plain text. I am trying really hard to get a proper data frame directly from the .xlsx file.
Following is the file
And this is the post following : Followup
This is what resulted:
str(full_data)
'data.frame': 0 obs. of 19 variables:
$ Experiment : Factor w/ 2 levels "#","1":
$ Mesocosm : Factor w/ 10 levels "#","1","2","3",..:
$ Exp.day : Factor w/ 24 levels "1","10","11",..:
$ Hour : Factor w/ 24 levels "108","12","132",..:
$ Temperature: Factor w/ 125 levels "10","10.01","10.02",..:
$ Salinity : num
$ pH : num
$ DIC : Factor w/ 205 levels "1582.2925","1588.6475",..:
$ TA : Factor w/ 117 levels "1813","1826",..:
$ DIN : Factor w/ 66 levels "0.2","0.3","0.4",..:
$ Chl.a : Factor w/ 156 levels "0.171","0.22",..:
$ PIC : Factor w/ 194 levels "-0.47","-0.96",..:
$ POC : Factor w/ 199 levels "-0.046","1.733",..:
$ PON : Factor w/ 151 levels "1.675","1.723",..:
$ POP : Factor w/ 110 levels "0.032","0.034",..:
$ DOC : Factor w/ 93 levels "100.1","100.4",..:
$ DON : Factor w/ 1 level "µmol/L":
$ DOP : Factor w/ 1 level "µmol/L":
$ TEP : Factor w/ 100 levels "10.4934","11.0053",..:
[Note: Above is the structure after reading from .xlsx file......the levels makes the calculation and manipulation part tedious and messy.]
This is what I want to achieve:
str(a)
'data.frame': 9936 obs. of 29 variables:
$ Ei : int 1 1 1 1 1 1 1 1 1 1 ...
$ Mi : int 1 1 1 1 1 1 1 1 1 1 ...
$ hours : int 1 2 3 4 5 6 7 8 9 10 ...
$ Cphy : num 0.653 0.645 0.637 0.63 0.624 ...
$ CHLphy : num 0.105 0.11 0.115 0.119 0.122 ...
$ Nhet : num 0.0499 0.0499 0.0498 0.0498 0.0497 ...
$ Chet : num 0.331 0.33 0.33 0.33 0.329 ...
$ Ndet : num 0.0499 0.0498 0.0498 0.0497 0.0496 ...
$ Cdet : num 0.331 0.33 0.33 0.329 0.329 ...
$ DON : num 0.0504 0.0509 0.0513 0.0518 0.0522 ...
$ DOC : num 49.8 49.5 49.3 49.1 48.8 ...
$ DIN : num 15 15 15 15 15 ...
$ DIC : num 2050 2050 2050 2051 2051 ...
$ AT : num 2150 2150 2150 2150 2150 ...
$ dCCHO : num 0.964 0.931 0.9 0.871 0.843 ...
$ TEPC : num 0.134 0.165 0.194 0.221 0.246 ...
$ Ncocco : num 0.104 0.108 0.11 0.113 0.114 ...
$ Ccocco : num 0.65 0.639 0.629 0.621 0.615 ...
$ CHLcocco: num 0.109 0.116 0.123 0.127 0.131 ...
$ PICcocco: num 0.1 0.1 0.101 0.102 0.103 ...
$ par : num 0 0 0.87 1.55 2.78 ...
$ Temp : num 9.9 9.9 9.9 9.9 9.9 9.9 9.9 9.9 9.9 9.9 ...
$ Sal : num 31.3 31.3 31.3 31.3 31.3 ...
$ co2atm : num 370 370 370 370 370 370 370 370 370 370 ...
$ u10 : num 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 ...
$ dicfl : num -2.96 -2.97 -2.98 -2.99 -3 ...
$ co2ppm : num 565 566 566 567 567 ...
$ co2mol : num 0.0256 0.0256 0.0257 0.0257 0.0257 ...
$ pH : num 7.88 7.88 7.88 7.88 7.88 ...
[Note: sorry for the extra columns, this is another dataset (simple text), which I am reading from read.table]
With NA's handled:
> unique(mydf_1$Exp.num)
[1] # 1
Levels: # 1
> unique(mydf_2$Exp.num)
[1] # 2
Levels: # 2
> unique(mydf_3$Exp.num)
[1] # 3
Levels: # 3
> unique(full_data$Exp.num)
[1] 2 3 4
Without handling NA's:
> unique(full_data$Exp.num)
[1] 1 NA 2 3
> unique(full_data$Mesocosm)
[1] 1 2 3 4 5 6 7 8 9 NA
I think this is what you need. I add a few comments on what I am doing:
# list.files() takes a regular expression, not a shell glob: "\\.xlsx$"
# matches file names that end in ".xlsx".
# NOTE(review): the code below assumes exactly one such file in the working
# directory - confirm.
xlfile <- list.files(pattern = "\\.xlsx$")
wb <- loadWorkbook(xlfile)
sheet_ct <- wb$getNumberOfSheets()
# Read each sheet into its own data frame: mydf_1, mydf_2, mydf_3, ...
# seq_len() gives an empty loop when the workbook reports zero sheets.
for (i in seq_len(sheet_ct)) {
  print(i)
  variable_name <- sprintf("mydf_%s", i)
  # Giving explicit first and last rows avoids reading trailing blank rows,
  # so the NA-row cleanup is not needed, but the row range has to be known.
  assign(
    variable_name,
    read.xlsx(xlfile, sheetIndex = i, startRow = 1, endRow = 209)
  )
}
colnames(mydf_1) <- names(mydf_2) #this here was unclear. I chose the second sheet's
# names as column names but you can chose whichever you want using the same (second and third column had the same names).
# Some sheets are loaded with a few blank rows (all NA). This helper is
# applied to the first column, which is always populated in real rows.
#
# Despite its name, remove_na_rows() does not remove anything: it returns
# the number of non-NA elements of x. Callers use that count as the index
# of the last row to keep, which assumes the blank rows come last.
#   x: a vector (typically one column of a data frame).
# Returns a single integer count.
remove_na_rows <- function(x) {
  sum(!is.na(x))
}
# Keep only the leading rows whose Exp.num is populated. seq_len() selects
# no rows when the count is 0, whereas 1:0 would select row 1.
mydf_1 <- mydf_1[seq_len(remove_na_rows(mydf_1$Exp.num)), ]
mydf_2 <- mydf_2[seq_len(remove_na_rows(mydf_2$Exp.num)), ]
mydf_3 <- mydf_3[seq_len(remove_na_rows(mydf_3$Exp.num)), ]
# Stack the sheets into one data frame, dropping the first row of each
# (presumably the repeated header/units row - confirm against the workbook).
full_data <- rbind(mydf_1[-1, ], mydf_2[-1, ], mydf_3[-1, ])
# Convert every column to numeric. Going through as.character() matters for
# factor columns: as.numeric() on a factor returns the internal level codes
# rather than the numbers shown. Assigning into full_data[] keeps the result
# a data frame; a bare lapply() result would be a plain list.
full_data[] <- lapply(full_data, function(x) as.numeric(as.character(x)))
# Store these columns as integer, for whichever of them are present.
for (int_col in intersect(c("Ei", "Mi", "hours"), names(full_data))) {
  full_data[[int_col]] <- as.integer(full_data[[int_col]])
}
# ---- Removing all-NA rows ----
# If the sheets were rbind-ed without trimming blank rows first, this drops
# every row that is NA in all columns.
all_na <- rowSums(!is.na(full_data)) == 0
# Indices of the dropped rows, kept under the original variable name.
n_row <- which(all_na)
# Select with a logical mask instead of full_data[-n_row, ]: a negative
# index built from an empty n_row selects zero rows, which would wipe the
# whole data frame whenever there is nothing to remove.
full_data <- full_data[!all_na, , drop = FALSE]
I think now this is what you need

Resources