R function keeps returning empty data frame - r

I'm trying to write an R function that loops through a given dataframe to filter it a bit. The data in the dataframe consists of travel information between two lines in the London subway an I'd like to cut off the top percent. Here's the output of the str() function for the input data:
'data.frame': 71748 obs. of 9 variables:
$ depart : Factor w/ 52 levels "Bank","Barkingside",..: 22 22 22 22 22 25 25 25 25 25 ...
$ arrival : Factor w/ 48 levels "Bank","Barkingside",..: 48 43 38 5 8 1 42 48 41 43 ...
$ traveltime : num 433 1102 161 584 891 ...
$ departuretime: POSIXlt, format: "2014-03-24 18:17:20" "2014-03-24 18:17:20" "2014-03-24 18:17:20" ...
$ arrivaltime : POSIXlt, format: "2014-03-24 18:24:33" "2014-03-24 18:35:42" "2014-03-24 18:20:01" ...
$ lcid : Factor w/ 28 levels "1000001","1000002",..: 1 1 1 1 1 1 1 1 1 1 ...
$ tripno : Factor w/ 25 levels "1","10","11",..: 2 2 2 2 2 2 2 2 2 2 ...
$ destination : Factor w/ 18 levels "Debden","Ealing Broadway",..: 3 3 3 3 3 3 3 3 3 3 ...
$ line : Factor w/ 1 level "C": 1 1 1 1 1 1 1 1 1 1 ...
Here's the functions I wrote:
#cut off top percent of travel times for each combination of arrival and
#departure stations to remove outliers
cutOffTopPercent <- function(data, percentage=0.99){
res <- data.frame()
#loop through all combinations of depart and arrival stations
for(i in 1:length(levels(data$depart))){
for(j in 1:length(levels(data$arrival))){
#create variables for departure/arrival station to make code easier to read
departureStation <- levels(data$depart)[i]
arrivalStation <- levels(data$arrival)[j]
#create a subset containing only the current departure and arrival station
dataSubset <- data[data$depart == departureStation & data$arrival == arrivalStation,]
#get top value that's allowed
upperBorder <- getTopPercentileBottom(dataSubset, percentage)
#remove records with values higher than than allowed
dataSubset <- dataSubset[dataSubset$traveltime < upperBorder,]
#glue the subset to the end result
res <- rbind(res,dataSubset)
}
}
return(res)
}
#returns the traveltime that marks where the given percentage of traveltimes starts
getTopPercentileBottom <- function(data, percentile){
upperBorder <- quantile(data$traveltime, probs = percentile)
return(upperBorder)
}
The cutOffTopPercent() function always returns an empty data frame however. I can't find my error. I've been trying to go to the steps manually, but when I do so, all the data subsets get appended to the res dataframe correctly.
Can anyone see what I did wrong, or suggest a better approach to what I'm trying to do?
EDIT:
a dput of the first 30 records in my input data:
structure(list(depart = structure(c(22L, 22L, 22L, 22L, 22L,
25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L,
25L, 25L, 25L, 25L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L), .Label = c("Bank",
"Barkingside", "Bethnal Green", "Bond Street", "Buckhurst Hill",
"Chancery Lane", "Chigwell", "Debden", "Ealing Broadway", "East Acton",
"Epping", "Fairlop", "Gants Hill", "Grange Hill", "Greenford",
"Hainault", "Hanger Lane", "Holborn", "Holland Park", "Lancaster Gate",
"Leyton", "Leytonstone", "Liverpool Street", "Loughton", "Marble Arch",
"Mile End", "Newbury Park", "Newbury Park Loop", "North Acton",
"North Acton Junction", "Northolt", "Notting Hill Gate", "Oxford Circus",
"Perivale", "Queensway", "Redbridge", "Roding Valley", "Ruislip Gardens",
"Shepherd's Bush", "Shepherds Bush (Central Line)", "Snaresbrook",
"South Ruislip", "South Woodford", "St. Paul's", "Stratford",
"Theydon Bois", "Tottenham Court Road", "Wanstead", "West Acton",
"West Ruislip", "White City", "Woodford"), class = "factor"),
arrival = structure(c(48L, 43L, 38L, 5L, 8L, 1L, 42L, 48L,
41L, 43L, 6L, 38L, 5L, 4L, 16L, 30L, 44L, 20L, 8L, 3L, 24L,
19L, 1L, 42L, 48L, 41L, 43L, 6L, 38L, 5L), .Label = c("Bank",
"Barkingside", "Bethnal Green", "Bond Street", "Buckhurst Hill",
"Chancery Lane", "Chigwell", "Debden", "East Acton", "Fairlop",
"Gants Hill", "Grange Hill", "Greenford", "Hainault", "Hanger Lane",
"Holborn", "Holland Park", "Lancaster Gate", "Leyton", "Leytonstone",
"Liverpool Street", "Loughton", "Marble Arch", "Mile End",
"Newbury Park", "North Acton", "North Acton Junction", "Northolt",
"Notting Hill Gate", "Oxford Circus", "Perivale", "Queensway",
"Redbridge", "Roding Valley", "Ruislip Gardens", "Shepherd's Bush",
"Shepherds Bush (Central Line)", "Snaresbrook", "South Ruislip",
"South Woodford", "St. Paul's", "Stratford", "Theydon Bois",
"Tottenham Court Road", "Wanstead", "West Acton", "White City",
"Woodford"), class = "factor"), traveltime = c(433, 1102,
161, 584, 891, 829, 1473, 2273, 629, 2942, 467, 2001, 2424,
75, 351, 165, 249, 1840, 2731, 1148, 1289, 1653, 580, 1224,
2024, 380, 2693, 218, 1752, 2175), departuretime = structure(list(
sec = c(20, 20, 20, 20, 20, 40, 40, 40, 40, 40, 40, 40,
40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 49, 49, 49, 49,
49, 49, 49, 49), min = c(17L, 17L, 17L, 17L, 17L, 46L,
46L, 46L, 46L, 46L, 46L, 46L, 46L, 46L, 46L, 46L, 46L,
46L, 46L, 46L, 46L, 46L, 50L, 50L, 50L, 50L, 50L, 50L,
50L, 50L), hour = c(18L, 18L, 18L, 18L, 18L, 17L, 17L,
17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L,
17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L,
17L), mday = c(24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L,
24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L,
24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L
), mon = c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L), year = c(114L, 114L, 114L, 114L,
114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L,
114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L,
114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L), wday = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), yday = c(82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L,
82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L,
82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L
), isdst = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L)), .Names = c("sec", "min", "hour",
"mday", "mon", "year", "wday", "yday", "isdst"), class = c("POSIXlt",
"POSIXt"), tzone = "GMT"), arrivaltime = structure(list(sec = c(33,
42, 1, 4, 11, 29, 13, 33, 9, 42, 27, 1, 4, 55, 31, 25, 49,
20, 11, 48, 9, 13, 29, 13, 33, 9, 42, 27, 1, 4), min = c(24L,
35L, 20L, 27L, 32L, 0L, 11L, 24L, 57L, 35L, 54L, 20L, 27L,
47L, 52L, 49L, 50L, 17L, 32L, 5L, 8L, 14L, 0L, 11L, 24L,
57L, 35L, 54L, 20L, 27L), hour = c(18L, 18L, 18L, 18L, 18L,
18L, 18L, 18L, 17L, 18L, 17L, 18L, 18L, 17L, 17L, 17L, 17L,
18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 17L, 18L, 17L, 18L,
18L), mday = c(24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L,
24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L,
24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L), mon = c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L),
year = c(114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L,
114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L,
114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L,
114L, 114L, 114L, 114L), wday = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), yday = c(82L,
82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L,
82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L,
82L, 82L, 82L, 82L, 82L, 82L, 82L), isdst = c(0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L
)), .Names = c("sec", "min", "hour", "mday", "mon", "year",
"wday", "yday", "isdst"), class = c("POSIXlt", "POSIXt"), tzone = "GMT"),
lcid = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = c("1000001", "1000002", "1000003",
"1000004", "1000005", "1000006", "1000007", "1000008", "1000009",
"1000010", "1000045", "1000054", "1000070", "1000088", "1000089",
"1000090", "1000097", "1000098", "1000099", "1000100", "1000101",
"1000102", "1000103", "1000104", "1000105", "1000106", "1000107",
"1000109"), class = "factor"), tripno = structure(c(2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("1",
"10", "11", "12", "13", "14", "15", "16", "17", "18", "19",
"2", "20", "21", "22", "23", "24", "3", "4", "5", "6", "7",
"8", "81", "9"), class = "factor"), destination = structure(c(3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("Debden",
"Ealing Broadway", "Epping", "Grange Hill via Woodford",
"Hainault", "Hainault via Newbury Park", "Hainault via Woodford",
"Leytonstone", "Loughton", "Marble Arch", "Newbury Park",
"North Acton", "Northolt", "Ruislip Gardens", "West Ruislip",
"White City", "Woodford", "Woodford Via Hainault"), class = "factor"),
line = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = "C", class = "factor")), .Names = c("depart",
"arrival", "traveltime", "departuretime", "arrivaltime", "lcid",
"tripno", "destination", "line"), row.names = c(NA, 30L), class = "data.frame")

Here a vectorized version of your code. Basically I used Map to avoid double loops and filling the result manullay (using rbind, very solw).
cutOffTopPercent <-
function(data,percent=0.99){
cut_off_dep_arr <-
function(dep,arr){
dataSubset <- data[data$depart == dep & data$arrival == arr,]
upperBorder <- getTopPercentileBottom(dataSubset, percent)
dataSubset[dataSubset$traveltime <= upperBorder,] ## <= not <
}
Map(cut_off_dep_arr,df$depart,df$arrival)
}
cutOffTopPercent(data=df)

Related

Table from results of sknn function (klaR package) won't output

I have a data set with 100 observations of 6 variables that I'm trying to run the sknn function on and then output a table of the results. I have updated the response variable to a factor to use as row and column headers in the table, and checked the data types of all other variables to make sure they are compatible (int and num). For some reason, no matter what I try, R freezes trying to pull the table of results and eventually crashes.
Here is the code I'm using:
loans_k$action_taken <- as.factor(loans_k$action_taken)
loans_k.knn <- sknn(loans_k$action_taken ~ ., data = loans_k, kn = 6)
loans_knntab <- table(loans_k$action_taken, predict(loans_k.knn)$class)
The data I'm using is:
> dput(loans_k)
structure(list(action_taken = structure(c(1L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("0",
"1"), class = "factor"), loan_type = c(1L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), loan_purpose = c(2L,
1L, 4L, 1L, 31L, 31L, 32L, 1L, 1L, 2L, 1L, 31L, 1L, 1L, 1L, 1L,
1L, 32L, 1L, 1L, 31L, 2L, 31L, 4L, 32L, 32L, 1L, 32L, 2L, 31L,
1L, 32L, 32L, 1L, 5L, 2L, 32L, 31L, 1L, 1L, 31L, 32L, 31L, 32L,
32L, 31L, 1L, 1L, 32L, 31L, 31L, 1L, 31L, 1L, 1L, 1L, 31L, 32L,
4L, 4L, 4L, 4L, 1L, 31L, 31L, 31L, 32L, 1L, 1L, 1L, 31L, 4L,
2L, 31L, 2L, 4L, 31L, 1L, 32L, 1L, 31L, 32L, 1L, 31L, 1L, 31L,
1L, 2L, 1L, 31L, 31L, 4L, 31L, 1L, 2L, 31L, 32L, 32L, 31L, 2L
), loan_to_value_ratio = c(63.2, 102.23, 55.98, 95, 75, 59.719,
24.038, 56.111, 98.02, 50, 80, 73.642, 96.5, 75, 73.723, 75,
63.141, 78.328, 66.32, 95, 61.667, 80, 81.944, 80, 58.685, 75.955,
85, 80, 50.144, 84.848, 80, 55.671, 60, 100, 64, 87.511, 48.541,
73.735, 102.16, 80, 61.62, 78.947, 44.64, 55.556, 26.846, 40.425,
102.391, 47.71, 51.333, 89.82, 78.949, 36.32, 55.441, 96.5, 95,
28.704, 74.199, 67.33, 77.84, 97.326, 26.75, 102, 80, 47.193,
76.27, 22.096, 72.759, 75, 97, 95, 67.942, 35.86, 84.04, 80,
92.25, 83.06, 85, 53.58, 75, 80, 54.415, 64.947, 100, 82, 80,
91.34, 95, 82.15, 100.514, 57.896, 71.42, 100, 59.905, 93.103,
79.91, 79.72, 60, 43.9, 79.772, 73.58), loan_term = c(60L, 360L,
240L, 360L, 348L, 360L, 180L, 360L, 360L, 60L, 360L, 312L, 360L,
360L, 360L, 360L, 240L, 360L, 180L, 360L, 360L, 360L, 360L, 240L,
180L, 360L, 360L, 360L, 120L, 360L, 360L, 360L, 360L, 360L, 240L,
240L, 360L, 360L, 360L, 360L, 240L, 360L, 60L, 180L, 120L, 180L,
360L, 360L, 120L, 360L, 360L, 360L, 360L, 360L, 360L, 180L, 360L,
360L, 360L, 60L, 360L, 360L, 360L, 180L, 360L, 120L, 360L, 360L,
360L, 360L, 180L, 120L, 360L, 360L, 60L, 240L, 360L, 240L, 360L,
360L, 360L, 360L, 360L, 360L, 360L, 360L, 360L, 120L, 360L, 360L,
360L, 360L, 360L, 360L, 240L, 360L, 240L, 240L, 360L, 361L),
debt_to_income_ratio = c(30L, 50L, 36L, 38L, 38L, 48L, 20L,
49L, 36L, 20L, 39L, 44L, 42L, 30L, 20L, 30L, 44L, 50L, 44L,
36L, 30L, 30L, 39L, 38L, 46L, 36L, 30L, 36L, 30L, 36L, 36L,
49L, 20L, 36L, 36L, 60L, 41L, 30L, 36L, 41L, 30L, 41L, 20L,
38L, 60L, 20L, 44L, 42L, 36L, 20L, 41L, 30L, 30L, 45L, 36L,
42L, 42L, 39L, 36L, 50L, 36L, 46L, 36L, 36L, 50L, 30L, 43L,
36L, 37L, 50L, 30L, 20L, 30L, 42L, 36L, 60L, 36L, 30L, 36L,
20L, 49L, 30L, 40L, 30L, 47L, 49L, 43L, 30L, 30L, 30L, 36L,
41L, 36L, 37L, 30L, 20L, 42L, 38L, 30L, 36L)), row.names = c(NA,
100L), class = "data.frame")
How can I get R to output the desired table?

Why is prediction error discrete in adabag?

I've got the table of 55 observations with 5 variables (F,H,R,T,U) and 1 classifier variable ("Group") in which I have two groups.
I'm doing data sampling by splitting the data into the training set (70%) and test set (30%). Then I run adaboosting and check how it works.
I want to get the adaboost error distribution for 100 samplings. But the distribution occurs to be discrete, outputting only five value variants: 0, 0.0588235294117647, 0.117647058823529 0.176470588235294 and 0.235294117647059.It doesn't change with mfinal argument. I guess there should be more! How it works?
I use the folowing code:
predictions<-list()
for (i in 1:100){
train.ind<-sample(nrow(df), nrow(df) * 0.7)
assign(paste0("ada",i), do.call(boosting,
c(formula=Group~F + H + R + T + U,
data=substitute(df[train.ind,]), mfinal=50, boos=FALSE,
coeflearn='Breiman'),envir = parent.frame()))
assign(paste0("pred",i), predict(ada,df[-train.ind,]))
predictions[[i]]<-get(paste0("pred",i))$error
}
hist(100*unlist(predictions),breaks=10,
main="Error probability [%] ntrees=10. 100 sampling operations", xlab="AdaBoost error")
dput(df)
structure(list(Group = structure(c(2L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("Canines", "Sled"), class = "factor"), F = c(0.263150566678734,
0.260347316635598, 0.26437277258488, 0.265710057607949, 0.254866055219663,
0.263294264681227, 0.261901194801303, 0.257318268395066, 0.26420207103455,
0.252093225560912, 0.255473253732324, 0.259067858940115, 0.259528043446917,
0.267331491048901, 0.260246447333382, 0.26035486437815, 0.254553215708594,
0.274074579975413, 0.262896904742862, 0.260504330262876, 0.258329960879536,
0.262664861154909, 0.256148832094211, 0.258509128895957, 0.256292083925698,
0.262358651734143, 0.254578103664353, 0.255386025800537, 0.264120912009577,
0.275232714712253, 0.265375720277527, 0.267601768121804, 0.262932226832642,
0.263633189245163, 0.262826186070212, 0.261058637786334, 0.262979366135887,
0.259232168979912, 0.252933156025384, 0.263963451214447, 0.258511197058683,
0.261957295373665, 0.253412282699461, 0.260748166588172, 0.263136039863289,
0.255317062006506, 0.258822015633545, 0.252757763183064, 0.260840486010478,
0.258620689655172, 0.263738813871524, 0.26241134751773, 0.26405425581719,
0.263685152057245, 0.262062787572784), H = c(0.242711147002311,
0.243850477245014, 0.245132979060713, 0.241794831140003, 0.235370262206577,
0.241392449436832, 0.236787894677703, 0.240434935369935, 0.234076675284456,
0.236978505926275, 0.23489414817613, 0.236461115627298, 0.241377100655228,
0.240778565421122, 0.238954656595734, 0.237237027626932, 0.23562891291975,
0.228247507171151, 0.235543469567304, 0.238348073568565, 0.237639956832591,
0.237993655975811, 0.23053394888479, 0.237553985998722, 0.238716430501961,
0.241044553515742, 0.23579805839771, 0.244646715997643, 0.245211405561299,
0.248463204730402, 0.237910443860818, 0.23772859908127, 0.242517289073306,
0.230376515634971, 0.239386381312522, 0.242971498213445, 0.248246377553633,
0.245227816034538, 0.237968589560153, 0.235998092571798, 0.235639593181493,
0.240320284697509, 0.239383587641388, 0.237939850635807, 0.240409493084614,
0.239705089012767, 0.235291279312896, 0.237725562711216, 0.251017166425148,
0.244410329082034, 0.247581475626206, 0.244082639531298, 0.248022977743474,
0.246127343801762, 0.246345535241663), R = c(0.23238005068085,
0.233913128793082, 0.232906768805408, 0.234580624702711, 0.23729616240706,
0.232552468336102, 0.23566425708828, 0.233370934038501, 0.23413197660754,
0.241255572873247, 0.240609653949119, 0.233790113420818, 0.239086204963073,
0.233644719452121, 0.23849468613068, 0.236846146329206, 0.239755264655663,
0.225925420024587, 0.239355887920232, 0.237429996633718, 0.23819641170916,
0.232039177131833, 0.223832380603256, 0.235838907338977, 0.236669843303285,
0.234916072348618, 0.238304558463179, 0.235904655883701, 0.232124394623714,
0.222879222527955, 0.233232723139038, 0.233871666714818, 0.235947441217151,
0.242585880964708, 0.234693056561268, 0.233941777691605, 0.229366135886539,
0.23539800906269, 0.239803390172875, 0.236505714593364, 0.24647853698133,
0.235569395017794, 0.242526379716086, 0.236207360559779, 0.234180854122081,
0.240408036487878, 0.239601762794737, 0.245058343429191, 0.234449894103222,
0.237875925051173, 0.230698942666106, 0.233475177304965, 0.231384358432554,
0.233114688928642, 0.230655428424067), T = c(0.261758235638105,
0.261889077326307, 0.257587479549, 0.257914486549337, 0.272467520166701,
0.262760817545838, 0.265646653432713, 0.268875862196498, 0.267589277073454,
0.269672695639567, 0.269022944142428, 0.270680912011768, 0.260008650934782,
0.258245224077857, 0.262304209940204, 0.265561961665713, 0.270062606715993,
0.271752492828849, 0.262203737769602, 0.263717599534841, 0.265833670578713,
0.267302305737446, 0.289484838417743, 0.268097977766344, 0.268321642269056,
0.261680722401497, 0.271319279474757, 0.264062602318119, 0.258543287805409,
0.253424858029389, 0.263481112722616, 0.260797966082108, 0.258603042876902,
0.263404414155158, 0.263094376055998, 0.262028086308617, 0.259408120423941,
0.26014200592286, 0.269294864241588, 0.263532741620391, 0.259370672778494,
0.262153024911032, 0.264677749943065, 0.265104622216242, 0.262273612930016,
0.264569812492848, 0.266284942258822, 0.264458330676529, 0.253692453461153,
0.25909305621162, 0.257980767836164, 0.260030835646007, 0.256538408006782,
0.25707281521235, 0.260936248761486), U = c(0.276642254462421,
0.275750907536407, 0.274138521440258, 0.279385339041277, 0.283770344294126,
0.273124933319108, 0.276770665567999, 0.272796198013943, 0.273326789343435,
0.278824893979485, 0.282917535762971, 0.269035729493284, 0.276381346021371,
0.275681845488406, 0.280473043309851, 0.274957072857482, 0.279453614114969,
0.265400901516186, 0.284438401450319, 0.275270067631668, 0.277080803992985,
0.268341093323935, 0.26334299428362, 0.27494270078114, 0.277070411973316,
0.276364671746617, 0.277622940087166, 0.275489489882784, 0.275412200032649,
0.267636555236813, 0.275475938484053, 0.27914367434201, 0.281161825726141,
0.287341513046201, 0.274277898463271, 0.272041104617345, 0.268317034458041,
0.277054269097656, 0.276448903327891, 0.282483963758864, 0.288513266166897,
0.280409252669039, 0.283610415243301, 0.27874587902846, 0.274619094771137,
0.275604453090517, 0.286100299160421, 0.288513039597016, 0.270078586556683,
0.280480764184118, 0.274123602187187, 0.277940178846747, 0.273784368554907,
0.282369310276287, 0.277372857201026)), na.action = structure(c(`2` = 2L,
`4` = 4L, `19` = 18L, `24` = 20L, `28` = 24L, `29` = 25L, `30` = 26L,
`32` = 28L, `33` = 29L, `42` = 38L, `54` = 46L, `69` = 54L, `74` = 58L,
`77` = 59L, `79` = 60L, `80` = 61L, `83` = 62L), class = "omit"), row.names = c(5L,
6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 15L, 16L, 17L, 18L, 20L,
25L, 26L, 27L, 31L, 41L, 44L, 46L, 47L, 48L, 50L, 51L, 52L, 55L,
57L, 64L, 65L, 66L, 67L, 68L, 70L, 71L, 72L, 85L, 86L, 87L, 88L,
89L, 90L, 91L, 92L, 93L, 94L, 95L, 96L, 97L, 98L, 99L, 100L,
101L, 102L, 103L), class = "data.frame")

Speeding up a loop (extracting specific values from a data frame)

My task is to extract all values in a column "2" after sorting by factor level in another column "3" (for the interested, i am sorting fasta sequences by organism). I am using this very simple code to get what i need.
df <- read.table("outfile.txt", fill=T) # the original output file includes many empty cells
# df is availabe at the bottom of this post
# splitting by factors
list1 <- split(df, df$V3)
# extract all values in column 2
list2 <- lapply(list1, function(x) as.data.frame(x$V2))
# writing results to file
for (x in names(list2))
write.table(list2[[x]], file=paste(x,".txt"), quote=F, row.names = F, col.names=F)
The works well on a small df. However, the output file contains several gigabytes of data. I tried a subset (500,000 rows on my local machine with 8GB RAM), but the second command is extremely slow (or R just hangs).
So i wondered and am asking the community, if there is a better way to solve this. Thank you.
Here is df:
dput(df)
structure(list(V1 = structure(c(1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L), .Label = c("C", "U"), class = "factor"),
V2 = structure(c(10L, 2L, 27L, 29L, 25L, 32L, 28L, 39L, 40L,
22L, 8L, 7L, 19L, 38L, 15L, 3L, 16L, 26L, 34L, 13L, 17L,
18L, 14L, 41L, 44L, 12L, 45L, 46L, 5L, 1L, 31L, 4L, 37L,
11L, 43L, 20L, 21L, 30L, 23L, 35L, 24L, 42L, 9L, 33L, 36L,
6L), .Label = c("M02978:20:000000000-B8C4P:1:1101:11008:4137",
"M02978:20:000000000-B8C4P:1:1101:14389:3444", "M02978:20:000000000-B8C4P:1:1101:14986:3769",
"M02978:20:000000000-B8C4P:1:1101:15333:4161", "M02978:20:000000000-B8C4P:1:1101:15438:4092",
"M02978:20:000000000-B8C4P:1:1101:15516:4514", "M02978:20:000000000-B8C4P:1:1101:16313:3660",
"M02978:20:000000000-B8C4P:1:1101:16433:3650", "M02978:20:000000000-B8C4P:1:1101:16663:4462",
"M02978:20:000000000-B8C4P:1:1101:17179:3407", "M02978:20:000000000-B8C4P:1:1101:17779:4225",
"M02978:20:000000000-B8C4P:1:1101:18008:3981", "M02978:20:000000000-B8C4P:1:1101:18047:3851",
"M02978:20:000000000-B8C4P:1:1101:18920:3936", "M02978:20:000000000-B8C4P:1:1101:19086:3737",
"M02978:20:000000000-B8C4P:1:1101:19203:3783", "M02978:20:000000000-B8C4P:1:1101:19335:3908",
"M02978:20:000000000-B8C4P:1:1101:19520:3921", "M02978:20:000000000-B8C4P:1:1101:19612:3701",
"M02978:20:000000000-B8C4P:1:1101:19655:4289", "M02978:20:000000000-B8C4P:1:1101:19918:4313",
"M02978:20:000000000-B8C4P:1:1101:20321:3602", "M02978:20:000000000-B8C4P:1:1101:21089:4350",
"M02978:20:000000000-B8C4P:1:1101:22293:4406", "M02978:20:000000000-B8C4P:1:1101:22453:3490",
"M02978:20:000000000-B8C4P:1:1101:23026:3811", "M02978:20:000000000-B8C4P:1:1101:23065:3472",
"M02978:20:000000000-B8C4P:1:1101:23770:3507", "M02978:20:000000000-B8C4P:1:1101:23991:3472",
"M02978:20:000000000-B8C4P:1:1101:24290:4332", "M02978:20:000000000-B8C4P:1:1101:24415:4142",
"M02978:20:000000000-B8C4P:1:1101:25066:3498", "M02978:20:000000000-B8C4P:1:1101:25678:4466",
"M02978:20:000000000-B8C4P:1:1101:25992:3830", "M02978:20:000000000-B8C4P:1:1101:26431:4388",
"M02978:20:000000000-B8C4P:1:1101:26573:4479", "M02978:20:000000000-B8C4P:1:1101:5567:4179",
"M02978:20:000000000-B8C4P:1:1101:6323:3723", "M02978:20:000000000-B8C4P:1:1101:6675:3536",
"M02978:20:000000000-B8C4P:1:1101:6868:3559", "M02978:20:000000000-B8C4P:1:1101:7078:3965",
"M02978:20:000000000-B8C4P:1:1101:8145:4431", "M02978:20:000000000-B8C4P:1:1101:8449:4257",
"M02978:20:000000000-B8C4P:1:1101:8592:3966", "M02978:20:000000000-B8C4P:1:1101:9468:4026",
"M02978:20:000000000-B8C4P:1:1101:9970:4051"), class = "factor"),
V3 = c(926550L, 0L, 1121396L, 406818L, 1265505L, 1167006L,
1121399L, 0L, 177437L, 0L, 1536652L, 0L, 1196029L, 0L, 1178540L,
138119L, 0L, 1536652L, 186802L, 0L, 1322246L, 1232437L, 1196029L,
1121396L, 452637L, 0L, 0L, 0L, 1541959L, 1121403L, 96561L,
1167006L, 767528L, 0L, 0L, 653733L, 1423815L, 857293L, 0L,
0L, 0L, 468059L, 1167006L, 1232437L, 880073L, 761193L), V4 = c(171L,
NA, 264L, 88L, 356L, 257L, 128L, NA, 97L, NA, 243L, NA, 96L,
NA, 80L, 93L, NA, 138L, 155L, NA, 243L, 262L, 77L, 470L,
135L, NA, NA, NA, 124L, 161L, 211L, 202L, 91L, NA, NA, 146L,
98L, 93L, NA, NA, NA, 107L, 382L, 247L, 130L, 157L), V5 = structure(c(25L,
1L, 2L, 17L, 9L, 5L, 3L, 1L, 16L, 1L, 14L, 1L, 7L, 1L, 6L,
11L, 1L, 14L, 24L, 1L, 10L, 8L, 7L, 2L, 18L, 1L, 1L, 1L,
15L, 4L, 26L, 5L, 13L, 1L, 1L, 20L, 12L, 22L, 1L, 1L, 1L,
19L, 5L, 8L, 23L, 21L), .Label = c("", "1121396,", "1121399,",
"1121403,", "1167006,", "1178540,", "1196029,", "1232437,",
"1265505,", "1322246,", "138119,", "1423815,", "1460634,1460635,",
"1536652,", "1541959,", "177437,", "406818,", "452637,",
"468059,", "653733,", "761193,", "857293,", "880073,", "883109,888727,1161902,1230734,1392487,",
"926550,", "96561,"), class = "factor")), .Names = c("V1",
"V2", "V3", "V4", "V5"), class = "data.frame", row.names = c(NA,
-46L))
using data.table package combined with write.table.
order by V3 and then write the V2 columns separately for each group in V3.
library('data.table')
setDT(df)[ order(V3), write.table(V2, file = paste0( V3, ".txt")), by = V3]
This worked for me but I cannot speak for how fast it would be on your machine.
lapply(unique(df$V3), function(x) write.table(df[which(df$V3 == x),]$V2, file = paste(x, ".txt", sep = ""), quote = FALSE, row.names = FALSE, col.names = FALSE))

R: aggregate based of multiple columns

How can I sum DBH and Basal area by Tree.Species Compartment, Stand, Transect and Plot?
Tree.Species DBH Basal Area Compartment Stand Transect Plot
Sugar Maple 16.4 211.1336 107 20 2 3
Sugar Maple 25.1 494.55785 107 20 2 3
Hemlock 15.1 178.98785 209 30 1 2
I was trying:
aggregate(.~ Compartment + Stand + Transect + Plot + Tree.Speices, data = Trees, FUN = sum)
but I keep getting this error:
Error in model.frame.default(formula = cbind(DBH, Basal.Area, Transect.., :
variable lengths differ (found for 'Transect')
My variables unique values and there are not all possible combinations in the data.
Compartment: 107 209 310 231
Stand: 20 110 30 240 80 300
Transect: 1 2 3
Plot: 1 2 3 4 5 6
dput:
structure(list(Tree.Speices = structure(c(53L, 49L, 49L, 49L,
49L, 11L, 49L, 12L, 49L, 4L, 49L, 49L, 49L, 53L, 49L, 49L, 4L,
4L, 33L, 4L, 11L, 53L, 11L, 53L, 53L, 21L, 21L, 53L, 49L, 53L,
49L, 49L, 53L, 21L, 4L, 4L, 49L, 12L, 21L, 49L, 49L, 49L, 49L,
9L, 49L, 49L, 11L, 11L, 53L, 47L, 33L, 11L, 5L, 49L, 11L, 11L,
38L, 11L, 49L, 11L, 11L, 11L, 11L, 49L, 53L, 53L, 53L, 47L, 49L,
49L, 49L, 47L, 49L, 33L, 4L, 4L, 47L, 4L, 11L, 49L, 53L, 49L,
11L, 11L, 11L, 49L, 11L, 11L, 11L, 11L, 11L, 49L, 11L, 49L, 47L,
49L, 11L, 11L, 11L, 11L, 21L), .Label = c("American Elm", "Aspen",
"Balsam", "Basswood", "Beech", "Big-Toothed Aspen", "Black Cherry",
"Cedar", "Cottonwood", "Elm", "Hemlock", "Hop Hornbeam", "paper birch",
"Paper Birch", "Poplar", "Quaking Aspen", "Red Maple", "Red Oak",
"Red Spruce", "snag", "Snag", "Snag (Aspen)", "Snag (Basswood)",
"Snag (Beech)", "Snag (Big-Toothed Aspen)", "SNAG (Big-Toothed Aspen)",
"snag (conifer)", "Snag (Conifer)", "Snag (Cottonwood)", "Snag (Elm)",
"Snag (hardwood)", "snag (Hemlock)", "Snag (Hemlock)", "SNAG (Hemlock)",
"Snag (maple)", "Snag (Maple)", "Snag (Oak)", "Snag (Paper Birch)",
"Snag (Poplar)", "Snag (Red Maple)", "snag (Sugar Maple)", "Snag (Sugar Maple)",
"Snag (Sugar)", "Snag (White Ash)", "Snag (White Pine)", "snag (Yellow Birch)",
"Snag (Yellow Birch)", "SNAG (Yellow Birch)", "Sugar Maple",
"White Ash", "White Birch", "White Pine", "Yellow Birch"), class = "factor"),
DBH = c(55.7, 21.3, 14, 38, 6.5, 20.3, 33.2, 6.3, 30.5, 22.3,
32.7, 8.9, 41.8, 30, 24.6, 13.8, 56.6, 49.5, 49.9, 63.2,
28, 39, 21, 25.7, 29.9, 38, 17.5, 22.4, 6.2, 20.3, 18.3,
21, 24.7, 49.5, 6.4, 30.3, 19.4, 6, 37.8, 24.6, 24.4, 9.5,
17.4, 49.2, 26, 31, 23.6, 19.8, 37.9, 25.8, 31.5, 18.1, 34.4,
59.7, 28.2, 21, 16.4, 23.7, 34.4, 24.7, 15.4, 12, 24.2, 34.2,
19.4, 15.1, 34.9, 34.8, 6.6, 61.2, 25.4, 38.8, 28.9, 32.3,
43.9, 33.8, 27.1, 37, 21.2, 26.4, 27.4, 10.6, 55.1, 69.4,
24, 25.4, 51, 20.2, 14.3, 31.8, 48.8, 38.3, 19.6, 26.3, 34.5,
6.3, 41.3, 32.6, 14.6, 9.1, 57.8), Basal.Area = c(2435.45465,
356.14665, 153.86, 1133.54, 33.16625, 323.49065, 865.2584,
31.15665, 730.24625, 390.37265, 839.39265, 62.17985, 1371.5834,
706.5, 475.0506, 149.4954, 2514.7946, 1923.44625, 1954.65785,
3135.4784, 615.44, 1193.985, 346.185, 518.48465, 701.79785,
1133.54, 240.40625, 393.8816, 30.1754, 323.49065, 262.88865,
346.185, 478.92065, 1923.44625, 32.1536, 720.70065, 295.4426,
28.26, 1121.6394, 475.0506, 467.3576, 70.84625, 237.6666,
1900.2024, 530.66, 754.385, 437.2136, 307.7514, 1127.58185,
522.5274, 778.91625, 257.17385, 928.9376, 2797.81065, 624.2634,
346.185, 211.1336, 440.92665, 928.9376, 478.92065, 186.1706,
113.04, 459.7274, 918.1674, 295.4426, 178.98785, 956.13785,
950.6664, 34.1946, 2940.1704, 506.4506, 1181.7704, 655.63985,
818.98265, 1512.85985, 896.8154, 576.51185, 1074.665, 352.8104,
547.1136, 589.3466, 88.2026, 2383.26785, 3780.8426, 452.16,
506.4506, 2041.785, 320.3114, 160.52465, 793.8234, 1869.4304,
1151.50865, 301.5656, 542.97665, 934.34625, 31.15665, 1338.96665,
834.2666, 167.3306, 65.00585, 2622.5594), Compartment = c(107L,
107L, 107L, 107L, 107L, 107L, 107L, 107L, 107L, 107L, 107L,
107L, 107L, 107L, 107L, 107L, 107L, 107L, 107L, 107L, 107L,
107L, 107L, 107L, 107L, 107L, 107L, 107L, 107L, 107L, 107L,
107L, 107L, 107L, 107L, 107L, 107L, 107L, 107L, 107L, 107L,
107L, 107L, 107L, 107L, 107L, 107L, 107L, 107L, 107L, 107L,
107L, 107L, 107L, 107L, 107L, 107L, 107L, 107L, 107L, 107L,
107L, 107L, 107L, 107L, 107L, 107L, 107L, 107L, 107L, 107L,
107L, 107L, 107L, 107L, 107L, 107L, 107L, 107L, 107L, 107L,
107L, 107L, 107L, 107L, 107L, 107L, 107L, 107L, 107L, 107L,
107L, 107L, 107L, 107L, 107L, 107L, 107L, 107L, 107L, 107L
), Stand = c(20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L,
20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L,
20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L,
20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L,
20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L,
20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L,
20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L,
20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L,
20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L), Transect.. = c(2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), Plot.. = c(1L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L)), .Names = c("Tree.Speices",
"DBH", "Basal.Area", "Compartment", "Stand", "Transect..", "Plot.."
), row.names = 67:167, class = "data.frame")
The last two column names have two dots at the end and Species is incorrectly spelled:
> names(Trees)
[1] "Tree.Speices" "DBH" "Basal.Area" "Compartment" "Stand"
[6] "Transect.." "Plot.."
Try:
aggregate(.~ Compartment + Stand + Transect.. + Plot.. + Tree.Speices,
data = Trees, FUN = sum)
or remove the dots at the end of all names and correct the spelling:
names(Trees) <- sub("\\.+$", "", names(Trees))
names(Trees) <- sub("Speices", "Species", names(Trees))
aggregate(.~ Compartment + Stand + Transect + Plot + Tree.Species,
data = Trees, FUN = sum)

R tells me " object 'train' not found "

In my customized function I met a strange problem.
I'm writing a function to do cross-validation with logistic and clogit(in survival) regression.Thus I need to generate a training set and testing set.I've marked the part to do it.
I need to compare the classic logistic regression and the conditional logistic regression.So I use an 'if' statement to distinguish those two functions.
Here's the problem.It seems that the glm function can find the train vector and doing well,but clogit can't find it!Even if the train vector is output correctly.
When I test each line out of my function gcv,clogit works again.
Can somebody tell me why is clogit not working with train?
I called this function as:
gcv(as.numeric(FNDX)~HIGD+DEG+CHK+AGP1+AGMN+NLV+LIV+WT+AGLP+MST+strata(STR),bbdm,method="clogit")
and the error message is
Error in `[.data.frame`(bbdm, train, ) : object 'train' not found
Do you need traceback() information?
and the data set is bbdm13 in http://www.umass.edu/statdata/statdata/stat-logistic.html.
There are NA in the original data,or use the sample after the code :)
Related codes are as following:
gcv<-function(formula,data=NULL,method="rpart",cross=5,times=10,k=7,layer=5,seed=0)
{
set=data;
n=nrow(set);
set.seed(as.vector(Sys.time()));
bb1=1:n;
bb2=rep(1:cross,ceiling(n/cross))[1:n];
bb2=sample(bb2,n);
samp=sample(c(1:n),size=n);
m=ceiling(n/cross);
smp<-mat.or.vec(cross,m);
j=rep(0,cross)
for (i in 1:n)
{
smp[bb2[i],j[bb2[i]]]=i
j[bb2[i]]=j[bb2[i]]+1
}
# Here we separate the original set into 5(variable cross)sets,
# each time we take one out and treat it as the testing set
mf <- match.call(expand.dots = FALSE)
m <- match(c("formula","data"), names(mf), 0L)
mf <- mf[c(1L, m)]
mf$drop.unused.levels <- TRUE
mf[[1L]] <- as.name("model.frame")
mf <- eval(mf, parent.frame())
response<-model.response(mf)
#code copied from function.lm
reslvl<-length(levels(response))
tra<-mat.or.vec(reslvl,reslvl);
tes<-mat.or.vec(reslvl,reslvl);
for (i in 1:cross)
{
test<-smp[i,];
train<-setdiff(1:200,test);
show(train); #THe 'train' set can be shown here.
#some "if" and "else"statements are hidden
if (method=="logistic")#logistic is running well
{
bb.log<-step(glm(formula,set,family=binomial),trace=FALSE)
tra<-tra+as.vector(t(table(response[train],
bin(predict.glm(bb.log,set[train,],type="response")))))
tes<-tes+as.vector(t(table(response[test],
bin(predict.glm(bb.log,set[test,],type="response")))))
}
else if (method=="clogit")#clogit is meeting a problem.
{
library("survival")
bb.clog<-step(clogit(formula,bbdm[train,]),trace=FALSE)
tra<-tra+as.vector(t(table( response[train],
bin(predict(bb.clog,set[train,])))))
tes<-tes+as.vector(t(table( response[test],
bin(predict(bb.clog,set[test,])))))
}
}
tra<-tra/cross;
tes<-tes/cross;
trainrate=1-sum(diag(tra))/sum(tra)
testrate=1-sum(diag(tes))/sum(tes)
result<-list(Train=tra,TrainRate=trainrate,Test=tes,TestRate=testrate)
result
}
Sample Data:
STR OBS AGMT FNDX HIGD DEG CHK AGP1 AGMN NLV LIV WT AGLP MST
1 1 1 39 1 9 0 1 23 13 0 5 118 39 1
2 1 2 39 0 10 0 2 16 11 1 3 175 39 3
3 1 3 39 0 11 0 2 20 12 1 3 135 39 2
4 1 4 39 0 12 1 1 21 11 0 3 125 40 1
5 2 1 38 1 14 2 1 24 14 1 3 118 39 1
6 2 2 38 0 12 1 2 20 15 0 2 183 38 1
7 2 3 38 0 9 0 2 19 11 0 5 218 38 1
8 2 4 38 0 13 1 1 23 13 0 2 192 37 1
9 3 1 38 1 9 0 1 22 15 2 2 125 38 1
10 3 2 38 0 10 0 2 20 14 0 2 123 38 1
11 3 3 38 0 15 1 1 19 13 3 2 140 37 1
12 3 4 38 0 12 1 1 18 13 0 2 160 38 1
13 4 1 38 1 15 1 1 24 14 2 3 150 38 5
14 4 2 38 0 15 2 1 26 13 1 1 130 38 2
15 4 3 38 0 12 1 2 23 14 0 4 140 38 1
16 4 4 38 0 12 1 1 25 16 0 2 130 38 1
17 5 1 38 1 12 1 1 21 17 0 2 150 38 2
18 5 2 38 0 12 1 2 20 12 1 2 148 38 1
19 5 3 38 0 14 2 1 22 13 0 2 134 39 1
20 5 4 38 0 13 1 1 16 14 0 6 138 38 4
21 6 1 38 1 13 1 1 24 12 1 3 116 39 1
22 6 2 38 0 12 1 2 19 12 0 2 145 35 2
23 6 3 38 0 14 2 2 21 10 4 3 195 35 1
24 6 4 38 0 14 4 1 25 8 0 1 180 38 2
25 7 1 37 1 17 4 1 26 13 1 4 137 37 5
26 7 2 37 0 15 2 1 20 11 2 2 135 37 2
27 7 3 37 0 9 0 1 18 10 2 3 155 37 1
28 7 4 37 0 12 1 2 22 13 2 2 120 38 1
29 8 1 36 1 12 1 1 23 14 0 2 126 36 2
30 8 2 36 0 10 0 1 20 12 1 2 191 36 1
31 8 3 36 0 10 0 2 17 10 1 3 185 37 1
32 8 4 36 0 12 1 2 23 12 0 2 119 37 1
33 9 1 35 1 12 1 1 23 14 0 3 129 36 1
34 9 2 35 0 14 1 2 21 11 0 3 170 34 2
35 9 3 36 0 12 1 1 22 14 0 4 110 36 1
36 9 4 35 0 14 2 2 24 11 0 2 155 35 1
37 10 1 35 1 12 1 2 21 12 0 2 105 29 1
38 10 2 36 0 17 3 1 26 13 1 2 115 36 1
39 10 3 36 0 12 1 2 22 12 2 3 120 36 1
40 10 4 36 0 12 1 1 33 16 0 1 150 36 1
Structure:
structure(list(STR = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 6L, 6L, 6L,
6L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 10L, 10L,
10L, 10L, 11L, 11L, 11L, 11L, 12L, 12L, 12L, 12L, 13L, 13L, 13L,
13L, 14L, 14L, 14L, 14L, 15L, 15L, 15L, 15L, 16L, 16L, 16L, 16L,
17L, 17L, 17L, 17L, 18L, 18L, 18L, 18L, 19L, 19L, 19L, 19L, 20L,
20L, 20L, 20L, 21L, 21L, 21L, 21L, 22L, 22L, 22L, 22L, 23L, 23L,
23L, 23L, 24L, 24L, 24L, 24L, 25L, 25L, 25L, 25L, 26L, 26L, 26L,
26L, 27L, 27L, 27L, 27L, 28L, 28L, 28L, 28L, 29L, 29L, 29L, 29L,
30L, 30L, 30L, 30L, 31L, 31L, 31L, 31L, 32L, 32L, 32L, 32L, 33L,
33L, 33L, 33L, 34L, 34L, 34L, 34L, 35L, 35L, 35L, 35L, 36L, 36L,
36L, 36L, 37L, 37L, 37L, 37L, 38L, 38L, 38L, 38L, 39L, 39L, 39L,
39L, 40L, 40L, 40L, 40L, 41L, 41L, 41L, 41L, 42L, 42L, 42L, 42L,
43L, 43L, 43L, 43L, 44L, 44L, 44L, 44L, 45L, 45L, 45L, 45L, 46L,
46L, 46L, 46L, 47L, 47L, 47L, 47L, 48L, 48L, 48L, 48L, 49L, 49L,
49L, 49L, 50L, 50L, 50L, 50L), .Label = c("1", "2", "3", "4",
"5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15",
"16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26",
"27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37",
"38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48",
"49", "50"), class = "factor"), OBS = structure(c(1L, 2L, 3L,
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L,
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L,
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L,
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L,
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L,
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L,
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L,
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L,
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L,
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L,
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L,
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L,
4L, 1L, 2L, 3L, 4L), .Label = c("1", "2", "3", "4"), class = "factor"),
AGMT = c(39L, 39L, 39L, 39L, 38L, 38L, 38L, 38L, 38L, 38L,
38L, 38L, 38L, 38L, 38L, 38L, 38L, 38L, 38L, 38L, 38L, 38L,
38L, 38L, 37L, 37L, 37L, 37L, 36L, 36L, 36L, 36L, 35L, 35L,
36L, 35L, 35L, 36L, 36L, 36L, 35L, 35L, 35L, 35L, 34L, 35L,
34L, 34L, 33L, 33L, 32L, 33L, 33L, 33L, 33L, 33L, 32L, 32L,
32L, 32L, 31L, 30L, 31L, 31L, 68L, 68L, 68L, 68L, 64L, 64L,
64L, 64L, 63L, 63L, 63L, 63L, 62L, 62L, 62L, 62L, 61L, 61L,
61L, 61L, 61L, 62L, 62L, 61L, 61L, 62L, 61L, 61L, 61L, 61L,
61L, 61L, 60L, 60L, 60L, 60L, 58L, 58L, 58L, 58L, 55L, 55L,
55L, 55L, 55L, 55L, 55L, 55L, 52L, 52L, 52L, 52L, 52L, 52L,
52L, 52L, 51L, 51L, 51L, 51L, 49L, 49L, 49L, 49L, 48L, 48L,
48L, 48L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 46L, 46L,
46L, 46L, 46L, 46L, 46L, 46L, 45L, 45L, 45L, 45L, 45L, 45L,
45L, 45L, 45L, 45L, 45L, 45L, 45L, 45L, 45L, 45L, 44L, 44L,
44L, 44L, 44L, 44L, 44L, 44L, 43L, 43L, 43L, 43L, 28L, 27L,
28L, 28L, 53L, 53L, 53L, 53L, 56L, 56L, 56L, 56L, 41L, 41L,
41L, 41L, 41L, 41L, 40L, 41L, 41L, 42L, 41L, 41L), FNDX = structure(c(2L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L,
1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L,
1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L,
1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
2L, 1L, 1L, 1L), .Label = c("0", "1"), class = "factor"),
HIGD = c(9L, 10L, 11L, 12L, 14L, 12L, 9L, 13L, 9L, 10L, 15L,
12L, 15L, 15L, 12L, 12L, 12L, 12L, 14L, 13L, 13L, 12L, 14L,
14L, 17L, 15L, 9L, 12L, 12L, 10L, 10L, 12L, 12L, 14L, 12L,
14L, 12L, 17L, 12L, 12L, 20L, 10L, 12L, 14L, 12L, 18L, 12L,
12L, 20L, 15L, 12L, 14L, 18L, 12L, 13L, 18L, 12L, 12L, 15L,
12L, 17L, 10L, 13L, 13L, 14L, 8L, 16L, 12L, 12L, 20L, 13L,
12L, 10L, 12L, 5L, 12L, 12L, 12L, 16L, 10L, 8L, 13L, 8L,
16L, 11L, 9L, 15L, 14L, 12L, 18L, 6L, 12L, 10L, 8L, 12L,
8L, 13L, 12L, 11L, 13L, 12L, 12L, 13L, 12L, 14L, 12L, 12L,
11L, 12L, 12L, 12L, 10L, 12L, 14L, 8L, 12L, 12L, 14L, 9L,
12L, 7L, 16L, 15L, 15L, 20L, 12L, 12L, 14L, 17L, 12L, 12L,
12L, 17L, 15L, 12L, 10L, 12L, 10L, 11L, 17L, 10L, 12L, 14L,
8L, 12L, 12L, 12L, 11L, 12L, 12L, 8L, 13L, 12L, 12L, 12L,
19L, 12L, 12L, 13L, 12L, 17L, 12L, 16L, 14L, 16L, 18L, 12L,
12L, 12L, 12L, 12L, 12L, 16L, 16L, 12L, 12L, 16L, 11L, 12L,
12L, 16L, 12L, 12L, 11L, 12L, 12L, 16L, 12L, 12L, 12L, 12L,
16L, 10L, 11L, 15L, 12L, 14L, 10L, 15L, 13L), DEG = structure(c(1L,
1L, 1L, 2L, 3L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 3L, 2L, 2L,
2L, 2L, 3L, 2L, 2L, 2L, 3L, 5L, 5L, 3L, 1L, 2L, 2L, 1L, 1L,
2L, 2L, 2L, 2L, 3L, 2L, 4L, 2L, 2L, 5L, 1L, 2L, 2L, 2L, 5L,
2L, 2L, 5L, 2L, 2L, 3L, 5L, 2L, 2L, 5L, 2L, 2L, 2L, 2L, 4L,
1L, 2L, 2L, 3L, 1L, 4L, 2L, 2L, 5L, 2L, 2L, 1L, 2L, 1L, 2L,
2L, 2L, 4L, 1L, 1L, 2L, 1L, 4L, 1L, 1L, 3L, 2L, 2L, 5L, 1L,
2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 3L, 2L, 2L, 3L, 2L, 3L, 2L,
2L, 1L, 2L, 2L, 2L, 1L, 2L, 3L, 1L, 2L, 2L, 3L, 1L, 2L, 1L,
4L, 3L, 3L, 5L, 2L, 2L, 3L, 5L, 2L, 2L, 2L, 5L, 2L, 2L, 1L,
2L, 1L, 1L, 4L, 1L, 2L, 3L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L,
2L, 2L, 2L, 2L, 5L, 2L, 2L, 2L, 2L, 5L, 2L, 4L, 2L, 4L, 5L,
2L, 2L, 2L, 2L, 2L, 2L, 4L, 4L, 2L, 2L, 4L, 1L, 2L, 2L, 4L,
2L, 2L, 1L, 2L, 2L, 4L, 2L, 2L, 2L, 2L, 4L, 1L, 1L, 2L, 2L,
2L, 1L, 2L, 2L), .Label = c("0", "1", "2", "3", "4"), class = "factor"),
CHK = structure(c(1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L,
1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L,
1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L,
2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L,
1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L,
2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L,
1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L,
2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L), .Label = c("1",
"2"), class = "factor"), AGP1 = c(23, 16, 20, 21, 24, 20,
19, 23, 22, 20, 19, 18, 24, 26, 23, 25, 21, 20, 22, 16, 24,
19, 21, 25, 26, 20, 18, 22, 23, 20, 17, 23, 23, 21, 22, 24,
21, 26, 22, 33, 26, 18, 19, 21, 25, 27, 20, 25, 26, 21, 24,
25, 28, 21, 20, 21, 30, 25, 20, 23, 30, 21, 23, 24, 22, 34,
23, 19, 30, 28, 26, 25, 21, 24, 24, 24, 26, 26, 32, 22, 28,
26, 28, 27, 22, 30, 25, 26, 26, 33, 25, 29, 21, 18, 22, 23,
28, 25, 24, 33, 20, 25, 24, 24, 30, 30, 30, 24, 24, 23, 16,
26, 24, 28, 20, 25, 23, 21, 23, 20, 24, 24, 22, 24, 25, 25,
24, 25, 22, 22, 23, 19, 26, 20, 24, 22, 19, 23, 23, 21, 27,
19, 26, 15, 27, 23, 22, 17, 33, 25, 20, 22, 24, 23, 20, 30,
18, 22, 30, 22, 25, 23, 23, 23, 25, 27, 27, 25, 24, 22, 23,
18, 27, 31, 14, 20, 29, 22, 20, 23, 29, 28, 23, 26, 21, 27,
26, 25, 25, 20, 21, 22, 40, 21, 21, 26, 34, 21, 30, 21),
AGMN = c(13L, 11L, 12L, 11L, 14L, 15L, 11L, 13L, 15L, 14L,
13L, 13L, 14L, 13L, 14L, 16L, 17L, 12L, 13L, 14L, 12L, 12L,
10L, 8L, 13L, 11L, 10L, 13L, 14L, 12L, 10L, 12L, 14L, 11L,
14L, 11L, 12L, 13L, 12L, 16L, 11L, 13L, 11L, 12L, 10L, 13L,
11L, 16L, 14L, 11L, 12L, 12L, 14L, 12L, 13L, 13L, 13L, 11L,
9L, 16L, 14L, 14L, 11L, 13L, 12L, 14L, 13L, 12L, 14L, 14L,
11L, 10L, 15L, 12L, 14L, 11L, 16L, 15L, 12L, 12L, 14L, 13L,
15L, 14L, 16L, 11L, 15L, 13L, 17L, 11L, 13L, 13L, 15L, 13L,
17L, 15L, 17L, 11L, 13L, 15L, 12L, 16L, 12L, 10L, 16L, 13L,
12L, 14L, 14L, 14L, 12L, 15L, 12L, 12L, 14L, 13L, 14L, 12L,
11L, 11L, 16L, 12L, 13L, 13L, 14L, 12L, 13L, 13L, 11L, 11L,
12L, 11L, 14L, 12L, 14L, 13L, 12L, 15L, 13L, 12L, 15L, 11L,
13L, 13L, 12L, 12L, 11L, 13L, 14L, 13L, 11L, 11L, 12L, 11L,
12L, 12L, 15L, 17L, 13L, 10L, 16L, 12L, 13L, 12L, 12L, 13L,
14L, 13L, 15L, 15L, 12L, 17L, 15L, 12L, 12L, 14L, 12L, 12L,
11L, 16L, 12L, 11L, 12L, 11L, 17L, 11L, 13L, 12L, 16L, 13L,
14L, 12L, 15L, 16L, 12L, 14L, 13L, 13L, 12L, 12L), NLV = c(0,
1, 1, 0, 1, 0, 0, 0, 2, 0, 3, 0, 2, 1, 0, 0, 0, 1, 0, 0,
1, 0, 4, 0, 1, 2, 2, 2, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 2,
0, 0, 2, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 2, 2, 1, 0, 2,
0, 0, 0, 1, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0,
0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 4, 0, 0, 0, 0, 1, 1, 0, 1,
0, 0, 0, 4, 1, 0, 0, 1, 3, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 1, 1, 0,
0, 0, 0, 0, 2, 1, 1, 1, 0), LIV = c(5, 3, 3, 3, 3, 2, 5,
2, 2, 2, 2, 2, 3, 1, 4, 2, 2, 2, 2, 6, 3, 2, 3, 1, 4, 2,
3, 2, 2, 2, 3, 2, 3, 3, 4, 2, 2, 2, 3, 1, 4, 2, 3, 2, 1,
4, 3, 1, 4, 1, 2, 2, 5, 2, 2, 1, 1, 2, 2, 2, 0, 3, 2, 3,
3, 3, 3, 7, 3, 3, 5, 2, 5, 2, 3, 3, 3, 2, 2, 3, 3, 1, 3,
2, 4, 1, 4, 3, 2, 1, 3, 2, 3, 5, 2, 3, 2, 2, 2, 3, 5, 3,
3, 0, 2, 2, 2, 6, 4, 3, 3, 4, 2, 2, 6, 3, 3, 3, 2, 5, 5,
4, 2, 5, 4, 2, 3, 3, 3, 1, 2, 0, 4, 5, 2, 3, 1, 3, 2, 5,
11, 3, 7, 1, 4, 4, 6, 3, 2, 1, 1, 3, 3, 2, 1, 3, 4, 2, 2,
5, 4, 3, 3, 4, 3, 3, 1, 2, 1, 1, 5, 7, 2, 1, 2, 6, 3, 1,
2, 2, 4, 3, 4, 1, 6, 4, 4, 2, 3, 4, 5, 4, 1, 3, 4, 3, 2,
2, 2, 2), WT = c(118L, 175L, 135L, 125L, 118L, 183L, 218L,
192L, 125L, 123L, 140L, 160L, 150L, 130L, 140L, 130L, 150L,
148L, 134L, 138L, 116L, 145L, 195L, 180L, 137L, 135L, 155L,
120L, 126L, 191L, 185L, 119L, 129L, 170L, 110L, 155L, 105L,
115L, 120L, 150L, 135L, 110L, 170L, 145L, 170L, 140L, 240L,
100L, 92L, 160L, 155L, 132L, 110L, 145L, 155L, 110L, 129L,
131L, 218L, 115L, 110L, 130L, 97L, 120L, 130L, 150L, 123L,
145L, 135L, 132L, 205L, 127L, 120L, 145L, 175L, 144L, 123L,
170L, 134L, 155L, 125L, 140L, 120L, 134L, 150L, 117L, 147L,
124L, 129L, 170L, 153L, 130L, 145L, 140L, 155L, 116L, 115L,
175L, 179L, 119L, 153L, 185L, 280L, 140L, 126L, 193L, 140L,
116L, 140L, 138L, 175L, 155L, 125L, 113L, 110L, 190L, 114L,
126L, 159L, 170L, 156L, 161L, 150L, 115L, 95L, 235L, 145L,
123L, 145L, 155L, 115L, 190L, 120L, 110L, 148L, 120L, 132L,
115L, 125L, 120L, 155L, 170L, 180L, 179L, 137L, 107L, 144L,
189L, 80L, 142L, 150L, 154L, 90L, 150L, 102L, 110L, 101L,
109L, 210L, 198L, 124L, 133L, 120L, 165L, 130L, 240L, 125L,
183L, 130L, 105L, 123L, 180L, 130L, 104L, 158L, 160L, 108L,
127L, 145L, 127L, 132L, 140L, 178L, 130L, 130L, 265L, 195L,
125L, 105L, 161L, 135L, 185L, 115L, 140L, 145L, 195L, 138L,
118L, 129L, 180L), AGLP = c(39L, 39L, 39L, 40L, 39L, 38L,
38L, 37L, 38L, 38L, 37L, 38L, 38L, 38L, 38L, 38L, 38L, 38L,
39L, 38L, 39L, 35L, 35L, 38L, 37L, 37L, 37L, 38L, 36L, 36L,
37L, 37L, 36L, 34L, 36L, 35L, 29L, 36L, 36L, 36L, 35L, 35L,
36L, 36L, 34L, 35L, 34L, 35L, 33L, 33L, 32L, 33L, 33L, 29L,
29L, 33L, 32L, 32L, 26L, 32L, 30L, 30L, 31L, 31L, 50L, 53L,
35L, 46L, 53L, 44L, 42L, 50L, 52L, 46L, 51L, 50L, 33L, 39L,
53L, 39L, 53L, 50L, 41L, 45L, 56L, 36L, 52L, 52L, 34L, 54L,
50L, 55L, 53L, 56L, 55L, 43L, 51L, 42L, 50L, 47L, 53L, 55L,
42L, 25L, 44L, 50L, 55L, 47L, 52L, 50L, 47L, 50L, 36L, 45L,
40L, 48L, 50L, 43L, 42L, 42L, 52L, 50L, 45L, 51L, 49L, 44L,
44L, 49L, 48L, 48L, 48L, 29L, 47L, 47L, 45L, 45L, 47L, 29L,
47L, 39L, 46L, 45L, 46L, 40L, 46L, 46L, 46L, 39L, 45L, 38L,
45L, 46L, 45L, 45L, 28L, 45L, 45L, 40L, 40L, 33L, 45L, 45L,
46L, 35L, 44L, 45L, 44L, 44L, 44L, 44L, 33L, 44L, 43L, 43L,
21L, 39L, 29L, 27L, 27L, 29L, 50L, 49L, 43L, 49L, 47L, 42L,
50L, 47L, 27L, 31L, 36L, 41L, 41L, 41L, 40L, 41L, 42L, 41L,
41L, 41L), MST = structure(c(1L, 3L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 5L, 2L, 1L, 1L, 2L, 1L, 1L, 4L, 1L, 2L,
1L, 2L, 5L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 5L, 1L, 1L, 1L,
1L, 5L, 3L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 2L, 4L, 5L,
4L, 1L, 5L, 4L, 4L, 1L, 5L, 3L, 1L, 5L, 1L, 4L, 4L, 1L, 1L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 4L, 1L, 4L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 4L, 5L, 1L, 1L, 1L, 1L, 3L,
5L, 1L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 5L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 4L, 1L,
1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 5L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 1L, 4L, 1L, 1L,
4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 4L, 1L, 1L, 1L,
1L, 3L, 4L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L), .Label = c("1",
"2", "3", "4", "5"), class = "factor")), .Names = c("STR",
"OBS", "AGMT", "FNDX", "HIGD", "DEG", "CHK", "AGP1", "AGMN",
"NLV", "LIV", "WT", "AGLP", "MST"), row.names = c(NA, -200L), class = "data.frame")
Could it be bbdm[train] that it can't find, rather than train itself? What error message do you get?
You can use the browser command to debug here. i.e.
gcv<-function(formula,data=NULL,method="rpart",cross=5,times=10,k=7,layer=5,seed=0)
{
set=data;
n=nrow(set);
set.seed(as.vector(Sys.time()));
bb1=1:n;
bb2=rep(1:cross,ceiling(n/cross))[1:n];
bb2=sample(bb2,n);
samp=sample(c(1:n),size=n);
m=ceiling(n/cross);
smp<-mat.or.vec(cross,m);
j=rep(0,cross)
for (i in 1:n)
{
smp[bb2[i],j[bb2[i]]]=i
j[bb2[i]]=j[bb2[i]]+1
}
# Here we separate the original set into 5(variable cross)sets,
# each time we take one out and treat it as the testing set
mf <- match.call(expand.dots = FALSE)
m <- match(c("formula","data"), names(mf), 0L)
mf <- mf[c(1L, m)]
mf$drop.unused.levels <- TRUE
mf[[1L]] <- as.name("model.frame")
mf <- eval(mf, parent.frame())
response<-model.response(mf)
#code copied from function.lm
reslvl<-length(levels(response))
tra<-mat.or.vec(reslvl,reslvl);
tes<-mat.or.vec(reslvl,reslvl);
for (i in 1:cross)
{
test<-smp[i,];
train<-setdiff(1:200,test);
show(train); #THe 'train' set can be shown here.
#some "if" and "else"statements are hidden
if (method=="logistic")#logistic is running well
{
bb.log<-step(glm(formula,set,family=binomial),trace=FALSE)
tra<-tra+as.vector(t(table(response[train],
bin(predict.glm(bb.log,set[train,],type="response")))))
tes<-tes+as.vector(t(table(response[test],
bin(predict.glm(bb.log,set[test,],type="response")))))
}
else if (method=="clogit")#clogit is meeting a problem.
{
##### BROWSER() CALL ##########
browser()
library("survival")
bb.clog<-step(clogit(formula,bbdm[train,]),trace=FALSE)
tra<-tra+as.vector(t(table( response[train],
bin(predict(bb.clog,set[train,])))))
tes<-tes+as.vector(t(table( response[test],
bin(predict(bb.clog,set[test,])))))
}
}
tra<-tra/cross;
tes<-tes/cross;
trainrate=1-sum(diag(tra))/sum(tra)
testrate=1-sum(diag(tes))/sum(tes)
result<-list(Train=tra,TrainRate=trainrate,Test=tes,TestRate=testrate)
result
}
Browser can be used to debug functions like this. Essentially, when you run the code, you'll enter into the environment at the moment browser was called. This will allow you to explore and see if the variables are what you thought they were. You can do an ls() to see which objects are defined, or try to find the value of train or (my suspicion) bbdm to see that they're all properly defined.

Resources