Summary statistics using ddply - r

I would like to write a function using ddply that outputs summary statistics for two columns of the data.frame mat, given the names of those two columns.
mat is a big data.frame with the name of columns "metric", "length", "species", "tree", ...,"index"
index is factor with 2 levels "Short", "Long"
"metric", "length", "species", "tree" and others are all continuous variables
Function:
summary1 <- function(arg1,arg2) {
...
ss <- ddply(mat, .(index), function(X) data.frame(
arg1 = as.list(summary(X$arg1)),
arg2 = as.list(summary(X$arg2)),
.parallel = FALSE)
ss
}
I expect the output to look like this after calling summary1("metric","length")
Short metric.Min. metric.1st.Qu. metric.Median metric.Mean metric.3rd.Qu. metric.Max. length.Min. length.1st.Qu. length
.Median length.Mean length.3rd.Qu. length.Max.
....
Long metric.Min. metric.1st.Qu. metric.Median metric.Mean metric.3rd.Qu. metric.Max. length.Min. length.1st.Qu. length
.Median length.Mean length.3rd.Qu. length.Max.
....
At the moment the function does not produce the desired output. What modification should be made here?
Thanks for your help.
Here is a toy example
mat <- data.frame(
metric = rpois(10,10), length = rpois(10,10), species = rpois(10,10),
tree = rpois(10,10), index = c(rep("Short",5),rep("Long",5))
)

As Nick wrote in his answer, you can't use $ to reference a variable passed as a character name. When you write X$arg1, R searches for a column literally named "arg1" in data.frame X. You can reference the column either by X[,arg1] or by X[[arg1]].
And if you want nicely named output I propose below solution:
# Per-level summary statistics for two columns of the global data frame `mat`.
#
# arg1, arg2: column names, each passed as a character string.
# Returns the data frame assembled by ddply(): one row per level of `index`,
# with the summary() statistics of each column prefixed by that column's name.
summary1 <- function(arg1, arg2) {
  describe_group <- function(X) {
    # `[[` resolves the column from the string held in arg1 / arg2.
    first_stats <- as.list(summary(X[[arg1]]))
    second_stats <- as.list(summary(X[[arg2]]))
    # Naming the two lists after the columns is what yields the
    # "<column>.<statistic>" headers in the resulting data frame.
    named_stats <- setNames(list(first_stats, second_stats), c(arg1, arg2))
    data.frame(named_stats)
  }
  ddply(mat, .(index), describe_group, .parallel = FALSE)
}
summary1("metric","length")
Output for toy data is:
index metric.Min. metric.1st.Qu. metric.Median metric.Mean metric.3rd.Qu.
1 Long 5 7 10 8.6 10
2 Short 7 7 9 8.8 10
metric.Max. length.Min. length.1st.Qu. length.Median length.Mean length.3rd.Qu.
1 11 9 10 11 10.8 12
2 11 4 9 9 9.0 11
length.Max.
1 12
2 12

Is this more like what you want?
# Per-level summary statistics for two columns of the global data frame `mat`.
#
# arg1, arg2: column names, each passed as a character string
#   (e.g. summary1("metric", "length")).
# Returns the data frame assembled by ddply(): one row per level of `index`,
# with columns named "<column>.<statistic>" (e.g. metric.Min., length.Mean).
summary1 <- function(arg1, arg2) {
  ddply(mat, .(index), function(X) {
    # `[[` looks the column up by the string stored in arg1 / arg2.
    stats <- list(
      as.list(summary(X[[arg1]])),
      as.list(summary(X[[arg2]]))
    )
    # Name the pieces after the requested columns; writing `arg1 = ...` inside
    # data.frame() would prefix every column with the literal text "arg1".
    data.frame(setNames(stats, c(arg1, arg2)))
  }, .parallel = FALSE) # an option of ddply(), so it must sit outside data.frame()
}

Related

Loop same actions in R

Have an issue here.
I want to loop my operations in R, however, do not know how to make this properly and efficiently.
I have several different sized datasets and performing the same block of code each time is time-consuming.
Here is the code I need to apply to each of the datasets and write the data or the output from a model into the datasets with different names.
##########################################################################################################################
#the combined list of separate data frames where the last letter is changing A, B, C...
z <- list(Data_A, Data_B, Data_C)
#need to loop these operations performed by using data from datasets. Here is an example by using data from Data_A dataset.
# TFP estimation by using ACF method
ACF_A <- prodest::prodestACF(Data_A$turn, fX = Data_A$cogs, sX = Data_A$tfa, pX = Data_A$cogs, idvar = Data_A$ID, timevar = Data_A$Year,
R = 100, cX = NULL, opt = 'DEoptim', theta0 = NULL, cluster = NULL)
omegaACF_A <- prodest::omega(ACF_A)
Data_A$omegaACF_A <- prodest::omega(ACF_A)
#########################################################################################################################
# Growth variables
Data_A <- Data_A %>%
arrange(ID, Year) %>%
group_by(ID) %>%
mutate(domegaACF_A = omegaACF_A - dplyr::lag(omegaACF_A),
debt = LOAN + LTD,
ddebt = debt - dplyr::lag(debt),
dsales = SALE - dplyr::lag(SALE)) %>%
ungroup
# Panel data frame
PData_A <- pdata.frame(Data_A, index = c("ID","Year"))
# Within estimator
within_2way_A <- plm(domegaACF_A ~ dplyr::lag(domegaACF_A, 1) + dplyr::lag(domegaACF_A, 2) + ddebt + lag(ff1, 1) + ddebt:lag(ff1, 1) + log(Age) + ta + dsales,
data = PData_A, effect = "twoways", model ="within", index = c("ID", "Year"))
The main problem is that I do not know how to store the data in separate datasets with according names. For example, in the block of the following code, _A should be changing to _B, _C according to the dataset that is used.
ACF_A <- prodest::prodestACF(Data_A$turn, fX = Data_A$cogs, sX = Data_A$tfa, pX = Data_A$cogs, idvar = Data_A$ID, timevar = Data_A$Year,
R = 100, cX = NULL, opt = 'DEoptim', theta0 = NULL, cluster = NULL)
omegaACF_A <- prodest::omega(ACF_A)
Data_A$omegaACF_A <- prodest::omega(ACF_A)
I know there are lapply and for loops but I do not know how to use them with changing names of storing variables:
z <- list (df1, df2, df3)
for (i in z){
ACF_[1 or 2 or 3] <- prodest::prodestACF(i$turn, fX = i$cogs, sX = i$tfa, pX = i$cogs, idvar = i$ID, timevar = i$Year,
R = 100, cX = NULL, opt = 'DEoptim', theta0 = NULL, cluster = NULL)
omegaACF_[1 or 2 or 3] <- prodest::omega(ACF_[1 or 2 or 3])
Data_[]$omegaACF_[1 or 2 or 3] <- prodest::omega(ACF_[1 or 2 or 3])
{
UPD: Here are several datasets: https://drive.google.com/drive/folders/1gBV2ZkywW6JqDjRICafCwtYhh2DHWaUq?usp=sharing
UPD2:
Data_A
turn cogs tfa SALE
1 1 1 1
2 2 2 2
3 3 3 3
4 4 4 4
Data_B
turn cogs tfa SALE
5 5 5 5
6 6 6 6
7 7 7 7
8 8 8 8
After running the loop I need:
ACF_A, ACF_B, etc. storage variable, where the results of the estimations of prodest function will be stored
omegaACF_A, omegaACF_B, etc. storage where omega variable from prodest will be stored
omegaACF_A, omegaACF_B results of estimations should be added to Data_A, Data_B datasets accordingly as a new variable.
After that, in Data_A, Data_B datasets, growth variables should be created
The plm regression should be stored in within_2way_A, within_2way_B accordingly
So in the end, I need:
Data_A
turn cogs tfa SALE omegaACF_A domegaACF_A debt ddebt dsales
1 1 1 1 0.1 NA 1 NA NA
2 2 2 2 0.3 0.2 2 1 1
3 3 3 3 0.6 0.3 3 1 1
4 4 4 4 0.9 0.3 4 1 1
Data_B
turn cogs tfa SALE omegaACF_B domegaACF_B debt ddebt dsales
5 5 5 5 1.1 NA 5 NA NA
6 6 6 6 1.5 0.4 6 1 1
7 7 7 7 1.7 0.2 7 1 1
8 8 8 8 2.0 0.3 8 1 1
One approach is to separate the ACF estimation and omega calculation from the summary creation with different lapply() commands. Since you did not supply any example data, it's a blind shot, but try the following. Note that I assumed that every dataset has the same column names! In case it doesn't solve your problem I will remove my answer.
# Keep the data sets in a named list so that every result below can be looked
# up by the same suffix (A, B, C) as its source data frame, e.g. Estimates$A.
data <- list(A = Data_A, B = Data_B, C = Data_C)

# Fit the ACF model once per data set.
# Assumes every data frame has the columns turn, cogs, tfa, ID and Year.
Estimates <- lapply(data, function(x) {
  prodest::prodestACF(
    x$turn,
    fX = x$cogs, sX = x$tfa, pX = x$cogs,
    idvar = x$ID, timevar = x$Year,
    R = 100, cX = NULL, opt = "DEoptim", theta0 = NULL, cluster = NULL
  )
})
Summaries_estimates <- lapply(Estimates, summary)
# omega() is applied to each fitted model; lapply() carries the names over.
Omegas <- lapply(Estimates, function(x) prodest::omega(x))
Summaries_omega <- lapply(Omegas, summary)
Alternative using loops
Since you asked, it is also possible to define a loop that loops everything together though this is usually much slower. For this, we have to define empty lists that carry the results of ACF etc. and loop over the lists of data.frames that we already created.
# Named list of inputs: results can then be read back as e.g. Omegas$A.
data <- list(A = Data_A, B = Data_B, C = Data_C)

# Pre-size one result list per quantity, carrying over the data-set names, so
# the loop fills slots instead of growing the lists.
empty_results <- setNames(vector("list", length(data)), names(data))
Estimates <- empty_results
Summaries_estimates <- empty_results
Omegas <- empty_results
Summaries_omega <- empty_results

# seq_along() is safe even if `data` is empty (1:length(data) would give 1, 0).
for (i in seq_along(data)) {
  current <- data[[i]]
  # Assumes every data frame has the columns turn, cogs, tfa, ID and Year.
  Estimates[[i]] <- prodest::prodestACF(
    current$turn,
    fX = current$cogs, sX = current$tfa, pX = current$cogs,
    idvar = current$ID, timevar = current$Year,
    R = 100, cX = NULL, opt = "DEoptim", theta0 = NULL, cluster = NULL
  )
  Summaries_estimates[[i]] <- summary(Estimates[[i]])
  Omegas[[i]] <- prodest::omega(Estimates[[i]])
  Summaries_omega[[i]] <- summary(Omegas[[i]])
}

Selecting 10 names based on 10 highest numbers of other column

I want to select the top 10 voted restaurants, and plot them together.
So i want to create a plot that shows the restaurant names and their votes.
I used:
topTenVotes <- top_n(dataSet, 10, Votes)
and it showed me data of the columns in dataset based on the top 10 highest votes, however i want just the number of votes and restaurant names.
My Question is how to select only the top 10 highest votes and their restaurant names, and plotting them together?
expected output:
Restaurant Names Votes
A 300
B 250
C 230
D 220
E 210
F 205
G 200
H 194
I 160
J 120
K 34
And then a bar plot that shows these restaurant names and their votes
Another simple approach with base functions creating another variable:
# Toy data: one row per restaurant with a random vote count.
df <- data.frame(Names = LETTERS, Votes = sample(40:400, length(LETTERS)))
# Named vote vector, so barplot() labels each bar with the restaurant name.
x <- setNames(df$Votes, df$Names)
# Rank from most to fewest votes and plot the leading ten.
ranked <- sort(x, decreasing = TRUE)
barplot(ranked[1:10], xlab = "Restaurant Name", ylab = "Votes")
Or a one-line solution with base functions:
barplot(sort(xtabs(Votes ~ Names, df), decreasing = TRUE)[1:10], xlab = "Restaurant Names")
I'm not seeing a data set to use, so here's a minimal example to show how it might work:
library(tidyverse)
df <-
tibble(
restaurant = c("res1", "res2", "res3", "res4"),
votes = c(2, 5, 8, 6)
)
df %>%
arrange(-votes) %>%
head(3) %>%
ggplot(aes(x = reorder(restaurant, votes), y = votes)) +
geom_col() +
coord_flip()
The top_n command also works in this case but is designed for grouped data.
It's more efficient, though less readable, to use base functions:
#toy data
d <- data.frame(list(Names = sample(LETTERS, size = 15), value = rnorm(25, 10, n = 15)))
head(d)
Names value
1 D 25.592749
2 B 28.362303
3 H 1.576343
4 L 28.718517
5 S 27.648078
6 Y 29.364797
#reorder by, and retain, the top 10
# Row positions ordered from the largest `value` down. Indexing `d` once keeps
# the original row names and returns each row exactly once, even when values
# tie (matching values back with `==` inside a loop would duplicate tied rows).
ranked_rows <- order(d$value, decreasing = TRUE)
# head() also copes with data that has fewer than 10 rows.
newdata <- d[head(ranked_rows, 10), ]
newdata
Names value
8 W 45.11330
13 K 36.50623
14 P 31.33122
15 T 30.28397
6 Y 29.36480
7 Q 29.29337
4 L 28.71852
10 Z 28.62501
2 B 28.36230
5 S 27.64808

Randomly select a certain percentage of rows and create new columns

I have a species column containing 10 species names. I have to distribute the species into four columns randomly such that each column will take a specific percentage of species.
Let's say the first column takes 20%, the second 30%, the third 40% and the last 10%. The four columns will be four different environments i.e.:
Restricted, Tidalflat, beach, estuary
Hence the column intake will be predefined but the selection will be random.
My input data will look like this:
species <- c('Natica','Tellina','Mactra','Natica','Arca','Arca','Tellina',
'Nassarius','Cardium','Cardium')
Result should look like this:
Some simple setup:
species <- c('Natica','Tellina','Mactra','Natica','Arca','Arca','Tellina',
'Nassarius','Cardium','Cardium')
rspecies <- sample(species)
envirs <- c('Restricted', 'Tidalflat', 'Beach', 'Estuary')
probs <- c(.2, .3, .4, .1)
nrs <- round(length(species) * probs)
Now, a data.frame with separate columns is not a very good way of expressing your data, as your data is not rectangular, i.e. you don't have the same number of observations in each column.
You can either present the data in long form:
df <- data.frame(species = rspecies, envir = rep(envirs, nrs), stringsAsFactors = FALSE)
species envir
1 Tellina Restricted
2 Natica Restricted
3 Arca Tidalflat
4 Mactra Tidalflat
5 Tellina Tidalflat
6 Arca Beach
7 Nassarius Beach
8 Cardium Beach
9 Cardium Beach
10 Natica Estuary
Or as a list:
split(rspecies, df$envir)
$Beach
[1] "Mactra" "Natica" "Arca" "Arca"
$Estuary
[1] "Tellina"
$Restricted
[1] "Nassarius" "Cardium"
$Tidalflat
[1] "Cardium" "Natica" "Tellina"
Edit:
One way to accommodate different number of species, is to make the assignment probabilistic according the environment. This will work better the larger the actual dataset is.
species2 <- c('Natica','Tellina','Mactra','Natica','Arca','Arca','Tellina',
'Nassarius','Cardium','Cardium', 'Cardium')
length(species2)
[1] 11
grps <- sample(envirs, size = length(species2), prob = probs, replace = TRUE)
df2 <- data.frame(species = species2, envir = grps, stringsAsFactors = FALSE)
df2 <- df2[order(df2$envir), ]
species envir
5 Arca Beach
10 Cardium Beach
1 Natica Estuary
11 Cardium Estuary
3 Mactra Restricted
7 Tellina Restricted
2 Tellina Tidalflat
4 Natica Tidalflat
6 Arca Tidalflat
8 Nassarius Tidalflat
9 Cardium Tidalflat
Maybe not in one line of code. I did not understand the column part, but you could use below to create a data frame but your column lengths are unequal.
species <- 1:1000
# Shuffle once; consecutive slices of the shuffled vector are then random groups.
ranspecies <- sample(species)
n_species <- length(species)
# Cumulative end positions of the 20% / 30% / 40% groups; the last 10% is
# whatever remains after position end90.
end20 <- floor(n_species * .20)
end50 <- floor(n_species * .50)
end90 <- floor(n_species * .90)
# seq_len() gives an empty index for an empty group, whereas 1:0 would
# silently select elements.
first20 <- ranspecies[seq_len(end20)]
second30 <- ranspecies[end20 + seq_len(end50 - end20)]
third40 <- ranspecies[end50 + seq_len(end90 - end50)]
forth10 <- ranspecies[end90 + seq_len(n_species - end90)]
or to match your example
species <- c(
  "Natica", "Tellina", "Mactra", "Natica", "Arca",
  "Arca", "Tellina", "Nassarius", "Cardium", "Cardium"
)
# Shuffle once; consecutive slices of the shuffled vector are then random groups.
ranspecies <- sample(species)
n_species <- length(species)
# Cumulative end positions of the 20% / 30% / 40% groups; the last 10% is
# whatever remains after position end90.
end20 <- floor(n_species * .20)
end50 <- floor(n_species * .50)
end90 <- floor(n_species * .90)
# seq_len() gives an empty index for an empty group, whereas 1:0 would
# silently select elements.
first20 <- ranspecies[seq_len(end20)]
second30 <- ranspecies[end20 + seq_len(end50 - end20)]
third40 <- ranspecies[end50 + seq_len(end90 - end50)]
forth10 <- ranspecies[end90 + seq_len(n_species - end90)]

# The groups differ in size, so pad the shorter ones with NA up to the longest
# before placing them side by side.
dflength <- max(length(first20), length(second30), length(third40), length(forth10))
pad_to_longest <- function(group) c(group, rep(NA, dflength - length(group)))
# Each column gets its own name; reusing `f` for both the first and the fourth
# group would make data.frame() rename one of them behind the scenes.
padded <- data.frame(
  first = pad_to_longest(first20),
  second = pad_to_longest(second30),
  third = pad_to_longest(third40),
  fourth = pad_to_longest(forth10)
)
padded
Although I feel that some of the steps could be more compact, I'll let you fiddle with it some more.

Merge 4 data objects with different columns (variables) in R

So initially I had the following object:
> head(gs)
year disturbance lek_id complex tot_male
1 2006 N 3T Diamond 3
2 2007 N 3T Diamond 17
3 1981 N bare 3corners 4
4 1982 N bare 3corners 7
5 1983 N bare 3corners 2
6 1985 N bare 3corners 5
With that I computed general statistics min, max, mean, and sd of tot_male for year within complex. I used R data splitting functions, and assigned logical column names where it seemed appropriate and ultimately made them different objects.
> tyc_min = aggregate(gs$tot_male, by=list(gs$year, gs$complex), FUN=min)
> names(tyc_min) = c("year", "complex", "tot_male_min")
> tyc_max = aggregate(gs$tot_male, by=list(gs$year, gs$complex), FUN=max)
> names(tyc_max) = c("year", "complex", "tot_male_max")
> tyc_mean = aggregate(gs$tot_male, by=list(gs$year, gs$complex), FUN=mean)
> names(tyc_mean) = c("year", "complex", "tot_male_mean")
> tyc_sd = aggregate(gs$tot_male, by=list(gs$year, gs$complex), FUN=sd)
> names(tyc_sd) = c("year", "complex", "tot_male_sd")
Example Output (2nd Object - Tyc_max):
year complex tot_male_max
1 2003 0
2 1970 3corners 26
3 1971 3corners 22
4 1972 3corners 26
5 1973 3corners 32
6 1974 3corners 18
Now I need to add the number of samples per year/complex combination as well. Then I need to merge these into single data object, and export as a .csv file
I know I need to use merge() function along with all.y but have no idea how to handle this error:
Error in fix.by(by.x, x) :
'by' must specify one or more columns as numbers, names or logical
Or.. add the number of samples per year and complex. Any suggestions?
This might work (but hard to check without a reproducible example):
gsnew <- Reduce(function(...) merge(..., all = TRUE, by = c("year","complex")),
list(tyc_min, tyc_max, tyc_mean, tyc_sd))
But instead of aggregating for the separate statistics and then merging, you can also aggregate everything at once into a new dataframe / datatable with for example data.table, dplyr or base R. Then you don't have to merge afterwards (for a base R solution see the other answer):
library(data.table)
# setDT() converts `gs` to a data.table by reference (no copy is made).
# The closing parenthesis of .( ) must come before `by`, otherwise `by` is
# swallowed into the list of summary columns and the call does not parse.
gsnew <- setDT(gs)[, .(male_min = min(tot_male),
                       male_max = max(tot_male),
                       male_mean = mean(tot_male),
                       male_sd = sd(tot_male),
                       male_n = .N), # number of samples per year/complex
                   by = .(year, complex)]
library(dplyr)
# One row per year/complex combination with the requested statistics.
gsnew <- gs %>%
  group_by(year, complex) %>%
  summarise(male_min = min(tot_male),
            male_max = max(tot_male),
            male_mean = mean(tot_male),
            male_sd = sd(tot_male),
            male_n = n()) %>% # number of samples per year/complex
  # summarise() only peels off the last grouping level; drop the rest so later
  # verbs are not silently applied per year.
  ungroup()
# Smallest and largest value of `x`, returned as a named vector (mi, ma).
mystat <- function(x) {
  extremes <- range(x)
  c(mi = extremes[1], ma = extremes[2])
}
# Demo on a built-in data set: one row per species, one column per statistic.
aggregate(Sepal.Length ~ Species, data = iris, FUN = mystat)
for you:
# Minimum, maximum, mean, standard deviation and number of values of `x`,
# returned as one named vector (mi, ma, m, s, l).
mystat <- function(x) {
  extremes <- range(x)
  c(
    mi = extremes[1],
    ma = extremes[2],
    m = mean(x),
    s = sd(x),
    l = length(x)
  )
}
# All five statistics per year/complex combination in a single call.
aggregate(tot_male ~ year + complex, data = gs, FUN = mystat)

Create a character vector column of predefined text and bind it to existing dataframe using rbind or bind_rows

Good day,
I will present two [likely] very puny problems for your excellent review.
Problem #1
I have a relatively tidy df (dat) with dim 10299 x 563. The 563 variables common to both datasets [that created] dat are 'subject' (numeric), 'label' (numeric), 3:563 (variable names from a text file). Observations 1:2947 are from a 'test' dataset whereas observations 2948:10299 are from a 'training' dataset.
I'd like to insert a column (header = 'type') into dat that is basically rows 1:2947 comprised of string test and rows 2948:10299 of string train that way I can group later on dataset or other similar aggregate functions in dplyr/tidyr.
I created a test df (testdf = 1:10299: dim(testdf) = 10299 x 1) and then:
testdat[1:2947 , "type"] <- c("test")
testdat[2948:10299, "type"] <- c("train")
> head(ds, 2);tail(ds, 2)
X1.10299 type
1 1 test
2 2 test
X1.10299 type
10298 10298 train
10299 10299 train
So I really don't like that there is now a column of X1.10299.
Questions:
Is there a better and more expedient way to create a column that has what I'm looking for based upon my use case above?
What is a good way to actually insert that column into 'dat' so that I can use it later for grouping with dplyr?
Problem #2
The way I arrived at my [nearly] tidy df (dat) from above was to take two dfs (test and train) of the form dim(2947 x 563 and 7352 x 563), respectively, and rbind them together.
I confirm that all of my variable names are present after the binding effort by something like this:
test.names <- names(test)
train.names <- names(train)
identical(test.names, train.names)
> TRUE
What is interesting and of primary concern is that if I try to use the bind_rows function from 'dplyr' to perform the same binding exercise:
dat <- bind_rows(test, train)
It returns a dataframe that apparently keeps all of my observations (x: 10299) but now my variable count is reduced from 563 to 470!
Question:
Does anyone know why my variables are being chopped?
Is this the best way to combine two dfs of the same structure for later slicing/dicing with dplyr/tidyr?
Thank you for your time and consideration of these matters.
Sample test/train dfs for review (the left most numeric are df indices):
test df
test[1:10, 1:5]
subject labels tBodyAcc-mean()-X tBodyAcc-mean()-Y tBodyAcc-mean()-Z
1 2 5 0.2571778 -0.02328523 -0.01465376
2 2 5 0.2860267 -0.01316336 -0.11908252
3 2 5 0.2754848 -0.02605042 -0.11815167
4 2 5 0.2702982 -0.03261387 -0.11752018
5 2 5 0.2748330 -0.02784779 -0.12952716
6 2 5 0.2792199 -0.01862040 -0.11390197
7 2 5 0.2797459 -0.01827103 -0.10399988
8 2 5 0.2746005 -0.02503513 -0.11683085
9 2 5 0.2725287 -0.02095401 -0.11447249
10 2 5 0.2757457 -0.01037199 -0.09977589
train df
train[1:10, 1:5]
subject label tBodyAcc-mean()-X tBodyAcc-mean()-Y tBodyAcc-mean()-Z
1 1 5 0.2885845 -0.020294171 -0.1329051
2 1 5 0.2784188 -0.016410568 -0.1235202
3 1 5 0.2796531 -0.019467156 -0.1134617
4 1 5 0.2791739 -0.026200646 -0.1232826
5 1 5 0.2766288 -0.016569655 -0.1153619
6 1 5 0.2771988 -0.010097850 -0.1051373
7 1 5 0.2794539 -0.019640776 -0.1100221
8 1 5 0.2774325 -0.030488303 -0.1253604
9 1 5 0.2772934 -0.021750698 -0.1207508
10 1 5 0.2805857 -0.009960298 -0.1060652
Actual Code (ignore the function calls/I'm doing most of the testing via console).
The data set I'm using with this code: http://archive.ics.uci.edu/ml/machine-learning-databases/00240/
# Entry point: reads the feature names, builds the activity lookup table and
# loads the test and train partitions through the two reader functions.
# Takes no arguments. The last statement is an assignment, so the function
# returns the `train` data frame invisibly; `test` is not returned.
run_analysis <- function () {
#Vars available for use throughout the function that should be preserved
# Feature names are read from features.txt in the working directory.
vars <- read.table("features.txt", header = FALSE, sep = "")
# Maps the numeric activity codes 1-6 to readable activity labels.
lookup_table <- data.frame(activitynum = c(1,2,3,4,5,6),
activity_label = c("walking", "walking_up",
"walking_down", "sitting",
"standing", "laying"))
test <- test_read_process(vars, lookup_table)
train <- train_read_process(vars, lookup_table)
}
# Read the three "test" files and combine them column-wise into one data frame.
#
# vars: table whose second column supplies the names given to the observation
#   columns.
# lookup_table: not used by the active code; only the commented-out merge
#   below refers to it.
# Returns the combined data frame with columns "subject", "labels", followed by
# one column per entry of vars[,2].
test_read_process <- function(vars, lookup_table) {
#read in the three documents for cbinding later
test.sub <- read.table("test/subject_test.txt", header = FALSE)
test.labels <- read.table("test/y_test.txt", header = FALSE)
test.obs <- read.table("test/X_test.txt", header = FALSE, sep = "")
#cbind the cols together and set remaining colNames to var names in vars
test.dat <- cbind(test.sub, test.labels, test.obs)
# NOTE(review): the names taken from vars[,2] are used as-is; nothing here
# checks that they are unique.
colnames(test.dat) <- c("subject", "labels", as.character(vars[,2]))
#Use lookup_table to set the "test_labels" string values that correspond
#to their integer IDs
#test.lookup <- merge(test, lookup_table, by.x = "labels",
# by.y ="activitynum", all.x = T)
# Remove the temporaries; rm() called inside a function acts on the function's
# own environment, not on the global one.
rm(test.sub, test.labels, test.obs)
#return
return(test.dat)
}
# Read the three "train" files and combine them column-wise into one data frame.
#
# vars: table whose second column supplies the names given to the observation
#   columns.
# lookup_table: accepted but not used in this function.
# Returns the combined data frame with columns "subject", "label", followed by
# one column per entry of vars[,2].
train_read_process <- function(vars, lookup_table) {
#read in the three documents for cbinding
train.sub <- read.table("train/subject_train.txt", header = FALSE)
train.labels <- read.table("train/y_train.txt", header = FALSE)
train.obs <- read.table("train/X_train.txt", header = FALSE, sep = "")
#cbind the cols together and set remaining colNames to var names in vars
train.dat <- cbind(train.sub, train.labels, train.obs)
# NOTE(review): the second column is named "label" (singular) here, while
# test_read_process names it "labels" - confirm which spelling is intended.
colnames(train.dat) <- c("subject", "label", as.character(vars[,2]))
# Remove the temporaries; rm() called inside a function acts on the function's
# own environment (including the local `vars` argument), not on the global one.
rm(train.sub, train.labels, train.obs, vars)
return(train.dat)
}
The problem that you're facing stems from the fact that you have duplicated names in the variable list that you're using to create your data frame objects. If you ensure that the column names are unique and shared between the objects the code will run. I've included a fully working example based on the code you used above (with fixes and various edits noted in the comments):
vars <- read.table(file="features.txt", header=F, stringsAsFactors=F)
## FRS: This is the source of original problem:
duplicated(vars[,2])
vars[317:340,2]
duplicated(vars[317:340,2])
vars[396:419,2]
## FRS: I edited the following to both account for your data and variable
## issues:
# Read the "test" partition and assemble it into one data frame.
#
# Reads test/subject_test.txt, test/y_test.txt and test/X_test.txt relative to
# the working directory.
# Returns a data frame with the columns `subject`, `labels`, then V1..Vk, one
# per column of X_test.txt.
test_read_process <- function() {
  # Read in the three documents for cbinding later.
  test.sub <- read.table("test/subject_test.txt", header = FALSE)
  test.labels <- read.table("test/y_test.txt", header = FALSE)
  test.obs <- read.table("test/X_test.txt", header = FALSE, sep = "")
  test.dat <- cbind(test.sub, test.labels, test.obs)
  # Placeholder names V1..Vk are unique by construction, unlike the feature
  # names in features.txt. Counting the observation columns themselves keeps
  # the function independent of a global `vars` object, and seq_len() stays
  # correct even for zero columns.
  colnames(test.dat) <- c("subject", "labels",
                          paste0("V", seq_len(ncol(test.obs))))
  test.dat
}
# Read the "train" partition and assemble it into one data frame.
#
# Reads train/subject_train.txt, train/y_train.txt and train/X_train.txt
# relative to the working directory.
# Returns a data frame with the columns `subject`, `labels`, then V1..Vk, one
# per column of X_train.txt.
train_read_process <- function() {
  # Read in the three documents for cbinding.
  train.sub <- read.table("train/subject_train.txt", header = FALSE)
  train.labels <- read.table("train/y_train.txt", header = FALSE)
  train.obs <- read.table("train/X_train.txt", header = FALSE, sep = "")
  train.dat <- cbind(train.sub, train.labels, train.obs)
  # Placeholder names V1..Vk are unique by construction, unlike the feature
  # names in features.txt. Counting the observation columns themselves keeps
  # the function independent of a global `vars` object, and seq_len() stays
  # correct even for zero columns.
  colnames(train.dat) <- c("subject", "labels",
                           paste0("V", seq_len(ncol(train.obs))))
  train.dat
}
test_df <- test_read_process()
train_df <- train_read_process()
identical(names(test_df), names(train_df))
library("dplyr")
## FRS: These could be piped together but I've kept them separate for clarity:
# Tag every row with the partition it came from, then stack the two partitions
# (test rows first) into a single data frame.
train_df <- mutate(train_df, test = "train")
test_df <- mutate(test_df, test = "test")
out_df <- bind_rows(test_df, train_df)
head(out_df)
out_df
## FRS: You can set your column names to those of the original
## variable list but you still have duplicates to deal with:
names(out_df) <- c("subject", "labels", as.character(vars[,2]), "test")
duplicated(names(out_df))

Resources