'mydata$column <- NULL' not working in function - r

This is the function:
# Intended to drop from the global `mydata` every column named in
# `column_vector` whose name ends in ".y".
# NOTE(review): as written it has no visible effect; see the comments below.
remove_column <- function(column_vector) {
for (column in column_vector) {
# '.' is a regex wildcard here, so this matches any character followed by "y"
if (grepl('.y$', column)) {
# `$` uses the literal name "column", not the value of the loop variable,
# and the assignment modifies a function-local copy of `mydata`
mydata$column <- NULL
}
}
}
What I think it's doing: I'm passing a vector of my column names to the function, it's looping through the list of names and asking whether the last characters of each column name are ".y". If that is the case, the function eliminates the column.
I've tried putting prints here and there to see my vector and to see whether the conditional evaluates to TRUE or FALSE, and everything seems to be working fine, but for some reason, it doesn't get rid of the column.
The following function returns my column vector:
# Return the column names that contain at least one period-separated part
# after the first (e.g. "colname.x").
#
# col_names: character vector of column names, e.g. names(mydata).
# Returns the matching names in their original order, or NULL when nothing
# matches (same as the value of an empty c()).
duplicate_names <- function(col_names) {
  # strsplit() needs character input; an empty input has no matches
  if (length(col_names) == 0) {
    return(NULL)
  }
  # split by period i.e. colname.x would be [colname, x]
  n_parts <- lengths(strsplit(col_names, '\\.'))
  duplicates <- unname(col_names[n_parts > 1])
  # keep the original contract: NULL rather than character(0) on no match
  if (length(duplicates) == 0) NULL else duplicates
}
I usually call it like this:
duplicate_names(names(mydata))
This is what the vector of columns looks like:
c('v1.x', 'v2.y')
When I print the function it returns the following:
[1] "v1.x" "v2.y"
As requested by a user, the dput(droplevels(horsedata[1:5, 1:5])) (data that I am using for this):
dput(droplevels(horsedata[1:5, 1:5]))
structure(list(ÿþhorse_name = structure(c(3L, 1L, 2L, 4L, 5L), .Label = c("IM PRETTY FAMES",
"JESS ROYAL BUCKS", "KISS ME IM SUGAR", "LOLAMO", "RUN MADISON RUN"
), class = "factor"), owner_name = structure(c(3L, 2L, 1L, 5L,
4L), .Label = c("Christine Tavares", "Heste Sport, Inc.", "Picov Cattle Co.",
"Procter, Wayne and Carol", "Ruth F. Barbour"), class = "factor"),
program = structure(1:5, .Label = c("1", "2", "3", "4", "5"
), class = "factor"), pp = 1:5, todays_cls = c(61L, 61L,
61L, 61L, 61L)), .Names = c("ÿþhorse_name", "owner_name",
"program", "pp", "todays_cls"), row.names = c(NA, 5L), class = "data.frame")

We don't need a loop to subset the columns.
mydata[!grepl('\\.y$', column_list)]
If there are other columns not in the column_list and we want to keep them (assuming that the 'column_list' is ordered)
# Keep every column of mydata except those whose position in column_list
# holds a name ending in ".y"; seq_len() also behaves for a zero-column frame,
# where 1:ncol(mydata) would yield c(1, 0).
mydata[setdiff(seq_len(ncol(mydata)), grep('\\.y$', column_list))]
We can modify your function by
changing .y$ to \\.y$ as . means any character and not just the dot
Instead of $, we use [ to subset the dataset
Return the dataset after the assignment
# Drop from `dat` every column listed in `column_vec` whose name ends in ".y".
#
# dat:        data frame to modify (a modified copy is returned).
# column_vec: character vector of candidate column names.
# Returns `dat` without the matching columns.
remove_column <- function(dat, column_vec) {
  # test all candidate names at once, then delete the hits one by one
  ends_in_y <- grepl('\\.y$', column_vec, perl=TRUE)
  for (nm in column_vec[ends_in_y]) {
    dat[nm] <- NULL
  }
  dat
}
remove_column(mydata, column_list)
# v1.x v2.x v3
#1 6 1 9
#2 4 11 7
#3 14 15 5
#4 10 2 4
#5 13 4 0
#6 19 14 1
#7 5 1 8
#8 16 12 7
#9 16 13 5
#10 5 0 7
data
mydata <- structure(list(v1.x = c(6L, 4L, 14L, 10L, 13L, 19L, 5L, 16L,
16L, 5L), v1.y = c(12L, 7L, 14L, 14L, 6L, 18L, 4L, 0L, 10L, 2L
), v2.x = c(1L, 11L, 15L, 2L, 4L, 14L, 1L, 12L, 13L, 0L), v2.y = c(6L,
5L, 7L, 3L, 19L, 4L, 15L, 13L, 14L, 20L), v3 = c(9L, 7L, 5L,
4L, 0L, 1L, 8L, 7L, 5L, 7L)), .Names = c("v1.x", "v1.y", "v2.x",
"v2.y", "v3"), row.names = c(NA, -10L), class = "data.frame")
column_list <- c('v1.x', 'v1.y', 'v2.x', 'v2.y')

Related

calculate the number of values in a column as a percentage using apply

my data frame:
data <-structure(list(col1 = c(1L, 2L, 2L, 4L, 5L, 6L, 7L, 8L, 9L),
col2 = c(NA, 5L, 6L, 7L, 8L, NA, 10L, 11L, 12L), col3 = c(6L,
7L, 8L, 9L, 10L, 11L, 12L, 13L, NA), col4 = c(9L, 10L, NA,
12L, 15L, 14L, 15L, 16L, 17L), col5 = c(12L, 13L, 14L, 15L,
16L, 17L, 18L, NA, 20L), GROUP = c(3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L), col6 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L),
col7 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L), col8 = c(1L,
1L, 1L, 4L, 5L, 6L, 7L, 8L, 9L), col9 = 10:18), class = "data.frame", row.names = c(NA,
-9L))
For each distinct value in a column, I want to output the value, its count, and its share as a percentage. I want this for the 9th and 10th columns of the data frame (col8 and col9).
I use apply this way:
k <- paste(apply(data[,c(9,10)], 2,table),apply(data[,c(9,10)], 2, prop.table))
and connect them into a cell
k <- apply(k,2,paste,collapse="\n")
but as a result I get an error:
Error in apply(k, 2, paste, collapse = "\n") :
dim(X) must have a positive length
UPDATE:
what I expect to get
new_data<-structure(list(col8 = "1 - 3 (33.3%)\n4 - 1 (11.1%)\n5 - 1 (11.1%)\n6 - 1 (11.1%)\n7 - 1 (11.1%)\n8 - 1 (11.1%)\n9 - 1 (11.1%)",
col9 = "1 - 0 (0%)\n10 - 1 (11.1%)\n11 - 1 (11.1%)\n12 - 1 (11.1%)\n13 - 1 (11.1%)\n14 - 1 (11.1%)\n15 - 1 (11.1%)\n16 - 1 (11.1%)\n17 - 1 (11.1%)\n18 - 1 (11.1%)"), class = "data.frame", row.names = c(NA,
-1L))
Define a function f that does the job for one column, and use lapply.
# Summarise one column as a single string with one "value - count (pct%)"
# entry per distinct value, entries separated by newlines.
f <- function(x) {
  counts <- table(x)
  # share of each value, as a percentage rounded to one decimal
  pct <- round(proportions(counts) * 100, 1)
  entries <- sprintf('%s - %s (%s%%)', names(counts), counts, pct)
  paste(entries, collapse='\n')
}
lapply(data[9:10], f)
# $col8
# [1] "1 - 3 (33.3%)\n4 - 1 (11.1%)\n5 - 1 (11.1%)\n6 - 1 (11.1%)\n7 - 1 (11.1%)\n8 - 1 (11.1%)\n9 - 1 (11.1%)"
#
# $col9
# [1] "10 - 1 (11.1%)\n11 - 1 (11.1%)\n12 - 1 (11.1%)\n13 - 1 (11.1%)\n14 - 1 (11.1%)\n15 - 1 (11.1%)\n16 - 1 (11.1%)\n17 - 1 (11.1%)\n18 - 1 (11.1%)"
Note, that prop.table is an earlier name, retained for back-compatibility, use proportions instead.

Why my dataset changed after running the setDT()

I have 2 CSV files, and I want to find the rows that are common to both. After reading them as data frames, I converted them to data.tables and then merged them. But somehow my code is not working: after using setDT() my dataset is changed, and I am not getting any common rows between them!
Before running my dataset
nodeA nodeB scr
1 ID08918 ID04896 1
2 ID00402 ID01198 1
3 ID00182 ID01576 1
4 ID06413 ID00745 1
5 ID00215 ID01175 1
6 ID00448 ID05351 1
7 ID00860 ID00959 0.996197718631179
8 ID01110 ID01127 0.99604743083004
9 ID00497 ID01192 0.995436766623207
10 ID00877 ID01590 0.993939393939394
11 ID01192 ID01183 0.992202729044834
12 ID00361 ID00570 0.988354430379747
13 ID01045 ID01201 0.98766954377312
14 ID11641 ID00541 0.986875315497224
15 ID11641 ID00570 0.98685540950455
16 ID00458 ID01151 0.986813186813187
17 ID00199 ID01211 0.981416957026713
18 ID00570 ID00309 0.981151299032094
19 ID00541 ID00309 0.978161503301168
20 ID00603 ID06789 0.977272727272727
library(dplyr)
df_1 <- read.csv("~/df_1.csv", stringsAsFactors = FALSE)
df_2 <- read.csv("~/df_2.csv", stringsAsFactors = FALSE)
library(data.table)
setDT(df_1)[,c("nodeA", "nodeB") := list(pmin(nodeA,nodeB), pmax(nodeA,nodeB))]
setDT(df_2)[,c("nodeA", "nodeB") := list(pmin(nodeA,nodeB), pmax(nodeA,nodeB))]
result <- merge(df_1[!duplicated(df_1),], df_2, allow.cartesian=TRUE)
After running the code my dataset is changed.
nodeA nodeB scr
1: ID08918 ID08918 1
2: ID00402 ID00402 1
3: ID00182 ID00182 1
4: ID06413 ID06413 1
5: ID00215 ID00215 1
6: ID00448 ID00448 1
7: ID00860 ID00860 0.996197718631179
8: ID01110 ID01110 0.99604743083004
9: ID00497 ID00497 0.995436766623207
10: ID00877 ID00877 0.993939393939394
11: ID01192 ID01192 0.992202729044834
12: ID00361 ID00361 0.988354430379747
13: ID01045 ID01045 0.98766954377312
14: ID11641 ID11641 0.986875315497224
15: ID11641 ID11641 0.98685540950455
16: ID00458 ID00458 0.986813186813187
17: ID00199 ID00199 0.981416957026713
18: ID00570 ID00570 0.981151299032094
19: ID00541 ID00541 0.978161503301168
20: ID00603 ID00603 0.977272727272727
Reproducible Dataset
df_1
structure(list(query = structure(c(18L, 5L, 1L, 17L, 3L, 6L,
12L, 15L, 8L, 13L, 16L, 4L, 14L, 19L, 19L, 7L, 2L, 10L, 9L, 11L
), .Label = c("ID00182", "ID00199", "ID00215", "ID00361", "ID00402",
"ID00448", "ID00458", "ID00497", "ID00541", "ID00570", "ID00603",
"ID00860", "ID00877", "ID01045", "ID01110", "ID01192", "ID06413",
"ID08918", "ID11641"), class = "factor"), target = structure(c(16L,
11L, 14L, 4L, 8L, 17L, 5L, 6L, 10L, 15L, 9L, 3L, 12L, 2L, 3L,
7L, 13L, 1L, 1L, 18L), .Label = c("ID00309", "ID00541", "ID00570",
"ID00745", "ID00959", "ID01127", "ID01151", "ID01175", "ID01183",
"ID01192", "ID01198", "ID01201", "ID01211", "ID01576", "ID01590",
"ID04896", "ID05351", "ID06789"), class = "factor"), new_ssp = structure(c(15L,
15L, 15L, 15L, 15L, 15L, 14L, 13L, 12L, 11L, 10L, 9L, 8L, 7L,
6L, 5L, 4L, 3L, 2L, 1L), .Label = c("0.977272727272727", "0.978161503301168",
"0.981151299032094", "0.981416957026713", "0.986813186813187",
"0.98685540950455", "0.986875315497224", "0.98766954377312",
"0.988354430379747", "0.992202729044834", "0.993939393939394",
"0.995436766623207", "0.99604743083004", "0.996197718631179",
"1"), class = "factor")), class = "data.frame", row.names = c(NA,
-20L))
df_2
structure(list(nodeA = structure(c(4L, 2L, 1L, 1L, 1L, 4L, 1L,
9L, 3L, 4L, 2L, 8L, 2L, 1L, 5L, 7L, 3L, 6L, 2L, 1L), .Label = c("ID00309",
"ID00361", "ID00541", "ID00570", "ID00615", "ID00696", "ID00762",
"ID01200", "ID05109"), class = "factor"), nodeB = structure(c(8L,
3L, 3L, 1L, 2L, 7L, 9L, 8L, 8L, 6L, 9L, 7L, 4L, 4L, 6L, 9L, 6L,
7L, 5L, 5L), .Label = c("ID00361", "ID00541", "ID00570", "ID00615",
"ID00696", "ID01200", "ID05109", "ID11641", "ID11691"), class = "factor"),
scr = structure(20:1, .Label = c("1.85284606048794", "1.90444166064472",
"1.90762235378507", "1.94364188077133", "1.95883206119256",
"2.08440437841349", "2.26408172709962", "2.3223132020942",
"2.46120775935034", "2.49647215035727", "2.50432367561777",
"2.57541320006514", "2.65099330092281", "2.75209155741549",
"2.93717640337986", "2.99596628688011", "3.21209741517806",
"3.21997803385465", "3.48788394772132", "3.81389707587156"
), class = "factor")), class = "data.frame", row.names = c(NA,
-20L))
Note: I am also using dplyr for some purposes like %>% etc. Does it mean, dplyr and data.table is conflicting somehow?
one possible solution with dplyr, inner_join and union from dplyr:
# inner join
# inner join
# Pass 1: df_2 rows whose (nodeA, nodeB) equal df_1's (query, target)
df_2 %>%
dplyr::inner_join(df_1, by = c("nodeA" = "query", "nodeB" = "target")) %>%
dplyr::mutate(GROUP = 1) %>%
# Pass 2: the same match with the pair reversed (nodeB = query,
# nodeA = target), tagged GROUP = 2 and combined with pass 1 by union()
dplyr::union(df_2 %>%
dplyr::inner_join(df_1, by = c("nodeB" = "query", "nodeA" = "target")) %>%
dplyr::mutate(GROUP = 2))
nodeA nodeB scr new_ssp GROUP
1 ID00361 ID00570 3.48788394772132 0.988354430379747 1
2 ID00570 ID11641 3.81389707587156 0.98685540950455 2
3 ID00309 ID00570 3.21997803385465 0.981151299032094 2
4 ID00309 ID00541 2.99596628688011 0.978161503301168 2
5 ID00541 ID11641 2.57541320006514 0.986875315497224 2

Sum consecutive hours when condition is met

I have a dataframe that has a timestamp and a numeric variable; the data is recorded once every hour. Ultimately, I'd like to know the mean number of consecutive hours that the variable is at or below a certain value. For example, what is the average number of hours that var1 is at or below 4? There are missing timestamps in the dataframe, so if the time is not consecutive the sum needs to restart.
In the example data frame the columns HoursBelow5 and RunningGroup were generated 'by hand'. If I could create these columns programmatically, I could filter to remove the RunningGroups that were associated with var1 values greater than 4 and then use dplyr::slice to get the maximum HoursBelow5 per group. I could then find the mean of these values.
So, in this approach I would need to create the restarting cumulative sum HoursBelow5, which restarts when the condition var1<5 is not met, or when the timestamp is not consecutive hours. I could then use ifelse statements to create the RunningGroup variable. Is this possible? I may be lacking the jargon to find the procedure. Cumsum and lag seemed promising, but I have yet to construct a procedure that does the above.
Or, there may be a smarter way to do this using the timestamp.
edit: result incorporating code from answer below
df1 <- df %>%
group_by(group = data.table::rleid(var1 > 4),
group1 = cumsum(ts - lag(ts, default = first(ts)) > 3600)) %>%
mutate(temp = row_number() * (var1 <= 4)) %>%
ungroup() %>%
filter(var1 <= 4) %>%
select(ts, var1, temp)
df2 <- df1 %>% mutate(temp2 = ifelse(temp==1, 1, 0),
newgroup = cumsum(temp2))
df3 <- df2 %>% group_by(newgroup) %>% slice(which.max(temp))
mean(df3$temp)
# example dataframe with desired output columns to then get actual output
df <- structure(list(ts = structure(c(-2208967200, -2208963600, -2208960000,
-2208956400, -2208952800, -2208949200, -2208945600, -2208942000,
-2208938400, -2208934800, -2208931200, -2208927600, -2208924000,
-2208913200, -2208909600, -2208906000, -2208902400, -2208898800,
-2208895200, -2208891600, -2208888000, -2208884400, -2208880800,
-2208877200, -2208852000, -2208848400, -2208844800, -2208841200,
-2208837600, -2208834000, -2208830400, -2208826800, -2208823200,
-2208819600, -2208816000, -2208812400, -2208808800, -2208805200,
-2208801600), class = c("POSIXct", "POSIXt"), tzone = ""), var1 = c(1L,
3L, 4L, 5L, 4L, 3L, 5L, 6L, 7L, 8L, 3L, 2L, 2L, 2L, 3L, 3L, 2L,
2L, 1L, 1L, 1L, 1L, 4L, 4L, 3L, 9L, 3L, 3L, 3L, 2L, 2L, 3L, 4L,
5L, 3L, 2L, 1L, 2L, 3L), HoursBelow5 = c(1L, 2L, 3L, 0L, 1L,
2L, 0L, 0L, 0L, 0L, 1L, 2L, 3L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 10L, 11L, 1L, 0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 0L, 1L, 2L,
3L, 4L, 5L), RunningGroup = c(1L, 1L, 1L, 2L, 3L, 3L, 4L, 5L,
6L, 7L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
10L, 11L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 13L, 14L, 14L, 14L,
14L, 14L), NotContinuous = c("", "", "", "", "", "", "", "",
"", "", "", "", "", "NC", "", "", "", "", "", "", "", "", "",
"", "NC", "", "", "", "", "", "", "", "", "", "", "", "", "",
"")), row.names = c(NA, -39L), class = "data.frame")
One way, using dplyr and data.table::rleid, could be
library(dplyr)
df %>%
# group  : run id that changes whenever `var1 > 4` flips
# group1 : counter that increases when the step from the previous timestamp
#          exceeds 3600
# NOTE(review): 3600 presumably means seconds (one hour); confirm the units
# of the difftime produced by `ts - lag(ts, ...)`
group_by(group = data.table::rleid(var1 > 4),
group1 = cumsum(ts - lag(ts, default = first(ts)) > 3600)) %>%
# position within the (group, group1) run, zeroed on rows where var1 > 4
mutate(temp = row_number() * (var1 <= 4)) %>%
ungroup() %>%
select(ts, var1, HoursBelow5, temp)
# ts var1 HoursBelow5 temp
# <dttm> <int> <int> <int>
# 1 1900-01-01 12:46:46 1 1 1
# 2 1900-01-01 13:46:46 3 2 2
# 3 1900-01-01 14:46:46 4 3 3
# 4 1900-01-01 15:46:46 5 0 0
# 5 1900-01-01 16:46:46 4 1 1
# 6 1900-01-01 17:46:46 3 2 2
# 7 1900-01-01 18:46:46 5 0 0
# 8 1900-01-01 19:46:46 6 0 0
# 9 1900-01-01 20:46:46 7 0 0
#10 1900-01-01 21:46:46 8 0 0
# … with 29 more rows
temp column is the one which was generated programmatically and HoursBelow5 is kept as it is for comparison purposes. If you also need RunningGroup you could use group and group1 together.

merge based on an id with missing values and string

my df is shown below
mydf<- structure(list(IDs = c(11L, 16L, 19L, 21L, 22L, 24L, 42L, 43L,
51L), string1 = structure(c(1L, 8L, 7L, 2L, 4L, 9L, 6L, 3L, 5L
), .Label = c("b", "g", "hue", "hyu", "if", "jud", "ufhy", "uhgf;ffugf",
"uhgs"), class = "factor"), IDs.1 = c(4L, 11L, 16L, 19L, 20L,
22L, 29L, NA, NA), string2 = structure(c(2L, 3L, 8L, 7L, 4L,
5L, 6L, 1L, 1L), .Label = c("", "a", "b", "higf;hdugd", "hyu",
"inja", "ufhy", "uhgf;ffugf"), class = "factor")), .Names = c("IDs",
"string1", "IDs.1", "string2"), class = "data.frame", row.names = c(NA,
-9L))
I want to get them together like below
myout<- structure(list(Ids = c(4L, 11L, 16L, 19L, 20L, 21L, 22L, 24L,
29L, 42L, 43L, 51L), string = structure(c(1L, 2L, 11L, 10L, 4L,
3L, 6L, 12L, 8L, 9L, 5L, 7L), .Label = c("a", "b", "g", "higf;hdugd",
"hue", "hyu", "if", "inja", "jud", "ufhy", "uhgf;ffugf", "uhgs"
), class = "factor")), .Names = c("Ids", "string"), class = "data.frame", row.names = c(NA,
-12L))
I tried to do it using merge
df1 <- mydf[,1:2]
df2 <- mydf[,3:4]
df3 = merge(df1, df2, by.x=c("IDs", "string"))
which gives me an error because they are unequal
I also tried to use the approach given here
How to join (merge) data frames (inner, outer, left, right)? which did not solve my problem
my input is like this
IDs string1 IDs string2
11 b 4 a
16 uhgf;ffugf 11 b
19 ufhy 16 uhgf;ffugf
21 g 19 ufhy
22 hyu 20 higf;hdugd
24 uhgs 22 hyu
42 jud 29 inja
43 hue
51 if
and the output looks like this
Ids string
4 a
11 b
16 uhgf;ffugf
19 ufhy
20 higf;hdugd
21 g
22 hyu
24 uhgs
29 inja
42 jud
43 hue
51 if
e.g. 11, 16 etc are repeated twice , so we only want them once
We can do an rbind and remove the duplicated elements
library(data.table)
# Stack the (IDs.1, string2) pair on top of (IDs, string1), drop rows with a
# missing id and repeated ids (first occurrence is kept), rename the two
# columns, then sort by id.
setnames(rbindlist(list(mydf[3:4], mydf[1:2]))[!is.na(IDs.1)&!duplicated(IDs.1)],
c("Ids", "string"))[order(Ids)]
# Ids string
# 1: 4 a
# 2: 11 b
# 3: 16 uhgf;ffugf
# 4: 19 ufhy
# 5: 20 higf;hdugd
# 6: 21 g
# 7: 22 hyu
# 8: 24 uhgs
# 9: 29 inja
#10: 42 jud
#11: 43 hue
#12: 51 if
Or another option is melt from data.table (to convert to 'long' format) which can take multiple measure patterns, then remove the duplicated 'Ids' and order using 'Ids'.
# Reshape both (id, string) column pairs to long form in one call, keep one
# row per id (the last occurrence), drop the helper column and sort by id.
# The argument is spelled out as `measure.vars` instead of relying on partial
# matching of `measure`.
# NOTE: setDT() converts `mydf` to a data.table by reference.
melt(setDT(mydf), measure.vars = patterns("ID", "string"), na.rm = TRUE,
     value.name = c("Ids", "string"))[!duplicated(Ids, fromLast = TRUE)
     ][, variable := NULL][order(Ids)]

Sum and replace columns with same name R for a data frame containing different classes

I have a data frame containing multiple classes, I would like to sum those columns that have the same name and are numeric, and replace the old columns with the new sum, does anyone know a way to do this?
i.e I have a data frame like:
col1 col2 col3 col3
char factor int int
I would like to produce
col1 col2 col3
char factor 2int
I have previously used:
data <- as.data.frame(do.call(cbind, by(t(data),INDICES=names(data),FUN=colSums)))
However this was on a dataframe that only had numeric variables.
There are other examples on the internet but not meeting the conditions of: replacement, preserving the rest of the frame, and of being on a frame with multiple classes
Similar question: how do I search for columns with same name, add the column values and replace these columns with same name by their sum? Using R
Try
dat1 <- dat #to keep a copy of the original dataset
indx <- vapply(dat, is.numeric, logical(1))#check which columns are numeric
nm1 <- which(indx)#get the numeric index of the column
indx2 <- duplicated(names(nm1))#check which among the
# integer columns are duplicated
# group the column positions by name, keeping the groups in first-appearance
# order so they line up with nm1[!indx2]; a plain split() on the names would
# return the groups in sorted-name order and misassign the sums
grp <- split(nm1, factor(names(nm1), levels = unique(names(nm1))))
#use `Map` after splitting the "nm1" with its "names", do the `rowSums`
if (length(nm1) > 0) {
  dat[nm1[!indx2]] <- Map(function(x, y) rowSums(x[y]), list(dat), grp)
}
# drop the later duplicates by position; setdiff() keeps every column when
# there is nothing to drop (dat[-integer(0)] would select no columns at all)
dat[setdiff(seq_along(dat), nm1[indx2])]
Update
Or to make it more efficient, only take the "duplicated" and "numeric" columns while leaving the others intact. Create an "index" (indx2) of columns that are duplicated. Subset the "nm1" based on the "indx2" and then do rowSums as described above. Finally, remove the unwanted columns (duplicated ones) by using the "indx3"
# flag every numeric column whose name occurs more than once
indx2 <- duplicated(names(nm1)) | duplicated(names(nm1), fromLast = TRUE)
nm2 <- nm1[indx2]
# later occurrences of each duplicated name (the columns to drop at the end)
indx3 <- duplicated(names(nm2))
# group positions by name in first-appearance order so the groups line up
# with nm2[!indx3]; a plain split() on the names would sort them
grp2 <- split(nm2, factor(names(nm2), levels = unique(names(nm2))))
if (length(nm2) > 0) {
  dat[nm2[!indx3]] <- Map(function(x, y) rowSums(x[y]), list(dat), grp2)
}
# setdiff() keeps every column when there are no duplicates to remove
datN <- dat[setdiff(seq_along(dat), nm2[indx3])]
datN
# col1 col2 col3 col4 col5
#1 16 23 2 10 10
#2 10 18 12 8 18
#3 21 23 15 6 10
#4 14 37 3 5 15
#5 29 39 5 1 11
#6 26 31 14 2 20
#7 25 31 2 8 10
#8 36 31 12 8 6
#9 32 26 13 6 4
#10 16 38 1 7 3
Checking the results
rowSums(dat1[names(dat1) %in% 'col1'])
#[1] 16 10 21 14 29 26 25 36 32 16
rowSums(dat1[names(dat1) %in% 'col2'])
#[1] 23 18 23 37 39 31 31 31 26 38
data
dat <- structure(list(col1 = c(6L, 5L, 15L, 11L, 14L, 19L, 6L, 16L,
17L, 6L), col2 = c(13L, 8L, 14L, 14L, 7L, 19L, 4L, 1L, 11L, 3L
), col3 = structure(c(2L, 5L, 8L, 3L, 4L, 7L, 2L, 5L, 6L, 1L), .Label = c("1",
"2", "3", "5", "12", "13", "14", "15"), class = "factor"), col2 = c(7L,
5L, 8L, 3L, 19L, 5L, 15L, 13L, 14L, 20L), col4 = structure(c(7L,
6L, 4L, 3L, 1L, 2L, 6L, 6L, 4L, 5L), .Label = c("1", "2", "5",
"6", "7", "8", "10"), class = "factor"), col5 = c(10L, 18L, 10L,
15L, 11L, 20L, 10L, 6L, 4L, 3L), col1 = c(10L, 5L, 6L, 3L, 15L,
7L, 19L, 20L, 15L, 10L), col2 = c(3L, 5L, 1L, 20L, 13L, 7L, 12L,
17L, 1L, 15L)), .Names = c("col1", "col2", "col3", "col2", "col4",
"col5", "col1", "col2"), row.names = c(NA, -10L), class = "data.frame")

Resources