In a dataset which contains many ids, I am only trying to manipulate rows which have id 7 or 9, and leave everything else untouched.
I am trying to conditionally remove a row from id 7 or id 9 whenever the other id has no row with the same itemcode. So, in the dput example below, I want to remove the itemcode=2 row from id=9 because id=7 does not have an itemcode=2. Vice versa, for id=7 I want to remove its itemcode=9 row because id=9 does not have it.
id client item itemcode unit X2001 X2002 X2003 X2004 X2005 X2006 X2007
...
7 7 Bob eighth 8 100 13 18 15 NA NA NA NA
8 7 Bob ninth 9 100 11 21 10 NA NA NA NA
9 9 Bob_new first 1 100 NA NA NA 23 18 25 18
Code:
structure(list(id = c(7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 10L), client = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L), .Label = c("Bob",
"Bob_new", "Mark"), class = "factor"), item = structure(c(3L,
9L, 4L, 2L, 8L, 7L, 1L, 5L, 3L, 6L, 9L, 4L, 2L, 8L, 7L, 1L, 3L
), .Label = c("eighth", "fifth", "first", "fourth", "ninth",
"second", "seventh", "sixth", "third"), class = "factor"), itemcode = c(1L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 1L
), unit = c(100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L,
100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L), X2001 = structure(c(5L,
6L, 1L, 4L, 2L, 5L, 3L, 1L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L
), .Label = c("11", "12", "13", "22", "24", "25", "NA"), class = "factor"),
X2002 = structure(c(4L, 8L, 1L, 3L, 7L, 2L, 5L, 6L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 9L), .Label = c("13", "14", "15",
"17", "18", "21", "22", "24", "NA"), class = "factor"), X2003 = structure(c(5L,
1L, 4L, 2L, 6L, 1L, 3L, 1L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L), .Label = c("10", "11", "15", "19", "23", "24", "NA"), class = "factor"),
X2004 = structure(c(7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 5L, 4L,
2L, 6L, 1L, 3L, 4L, 3L, 4L), .Label = c("11", "14", "15",
"20", "23", "25", "NA"), class = "factor"), X2005 = structure(c(6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 3L, 2L, 4L, 3L, 5L, 3L, 1L, 4L,
3L), .Label = c("11", "13", "18", "19", "25", "NA"), class = "factor"),
X2006 = structure(c(9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 8L, 6L,
1L, 2L, 5L, 3L, 7L, 8L, 4L), .Label = c("10", "15", "18",
"19", "20", "22", "23", "25", "NA"), class = "factor"), X2007 = structure(c(8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 4L, 7L, 6L, 2L, 4L, 1L, 5L, 5L,
3L), .Label = c("12", "13", "16", "18", "19", "21", "24",
"NA"), class = "factor")), .Names = c("id", "client", "item",
"itemcode", "unit", "X2001", "X2002", "X2003", "X2004", "X2005",
"X2006", "X2007"), class = "data.frame", row.names = c(NA, -17L
))
————————————————————————————————————————
ANOTHER SCENARIO:
before:
structure(list(id = c(7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 10L), client = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 3L), .Label = c("Bob", "Bob_new", "Mark"), class = "factor"),
item = structure(c(3L, 9L, 10L, 9L, 4L, 2L, 8L, 7L, 7L, 1L,
5L, 3L, 6L, 9L, 4L, 2L, 8L, 7L, 1L, 3L), .Label = c("eighth",
"fifth", "first", "fourth", "ninth", "second", "seventh",
"sixth", "third", "third "), class = "factor"), itemcode = c(1L,
3L, 3L, 3L, 4L, 5L, 6L, 7L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 1L), type = structure(c(1L, 1L, 2L, 3L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("A",
"B", "C"), class = "factor"), unit = c(100L, 100L, 100L,
100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L,
100L, 100L, 100L, 100L, 100L, 100L, 100L), X2001 = c(24L,
25L, 30L, 26L, 11L, 22L, 12L, 25L, 24L, 13L, 11L, NA, NA,
NA, NA, NA, NA, NA, NA, NA), X2002 = c(17L, 24L, 12L, 96L,
13L, 15L, 22L, 21L, 14L, 18L, 21L, NA, NA, NA, NA, NA, NA,
NA, NA, NA), X2003 = c(23L, 10L, 46L, 94L, 19L, 11L, 24L,
19L, 10L, 15L, 10L, NA, NA, NA, NA, NA, NA, NA, NA, NA),
X2004 = c(NA, NA, 43L, 83L, NA, NA, NA, 6L, NA, NA, NA, 23L,
20L, 14L, 25L, 11L, 15L, 20L, 15L, 20L), X2005 = c(NA, NA,
97L, 86L, NA, NA, NA, 17L, NA, NA, NA, 18L, 13L, 19L, 18L,
25L, 18L, 11L, 19L, 18L), X2006 = c(NA, NA, 11L, 91L, NA,
NA, NA, 11L, NA, NA, NA, 25L, 22L, 10L, 15L, 20L, 18L, 23L,
25L, 19L), X2007 = c(NA, NA, 19L, 27L, NA, NA, NA, 15L, NA,
NA, NA, 18L, 24L, 21L, 13L, 18L, 12L, 19L, 19L, 16L)), .Names = c("id",
"client", "item", "itemcode", "type", "unit", "X2001", "X2002",
"X2003", "X2004", "X2005", "X2006", "X2007"), class = "data.frame", row.names = c(NA,
-20L))
after:
structure(list(id = c(7L, 7L, 7L, 7L, 7L, 7L, 9L, 9L, 9L, 9L,
9L, 9L, 10L), client = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 3L), .Label = c("Bob", "Bob_new", "Mark"), class = "factor"),
item = structure(c(2L, 7L, 3L, 1L, 5L, 4L, 2L, 6L, 3L, 1L,
5L, 4L, 2L), .Label = c("fifth", "first", "fourth", "seventh",
"sixth", "third", "third "), class = "factor"), itemcode = c(1L,
3L, 4L, 5L, 6L, 7L, 1L, 3L, 4L, 5L, 6L, 7L, 1L), type = structure(c(1L,
2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L), .Label = c("A",
"B"), class = "factor"), unit = c(100L, 100L, 100L, 100L,
100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L), X2001 = c(24L,
10L, 11L, 22L, 12L, 17L, NA, NA, NA, NA, NA, NA, NA), X2002 = c(17L,
87L, 13L, 15L, 22L, 19L, NA, NA, NA, NA, NA, NA, NA), X2003 = c(23L,
47L, 19L, 11L, 24L, 17L, NA, NA, NA, NA, NA, NA, NA), X2004 = c(NA,
28L, NA, NA, NA, 28L, 23L, 14L, 25L, 11L, 15L, 20L, 20L),
X2005 = c(NA, 43L, NA, NA, NA, 16L, 18L, 19L, 18L, 25L, 18L,
11L, 18L), X2006 = c(NA, 69L, NA, NA, NA, 5L, 25L, 10L, 15L,
20L, 18L, 23L, 19L), X2007 = c(NA, 72L, NA, NA, NA, 20L,
18L, 21L, 13L, 18L, 12L, 19L, 16L)), .Names = c("id", "client",
"item", "itemcode", "type", "unit", "X2001", "X2002", "X2003",
"X2004", "X2005", "X2006", "X2007"), class = "data.frame", row.names = c(NA,
-13L))
I was able to use the suggested filter code to remove items that do not exist under the corresponding id (7 or 9).
But there can also be sub-levels for items, such as the type of item, and I am also trying to remove items that do not have the same type under the corresponding id.
You could use filter from dplyr
library(dplyr)
# Keep a row when it belongs to neither id 7 nor id 9, or when its itemcode
# occurs under both id 7 and id 9.
df_all %>%
  filter(
    !(id %in% c(7, 9)) |
      itemcode %in% intersect(itemcode[id == 7], itemcode[id == 9])
  )
# id client item itemcode unit X2001 X2002 X2003 X2004 X2005 X2006 X2007
#1 7 Bob first 1 100 24 17 23 NA NA NA NA
#2 7 Bob third 3 100 25 24 10 NA NA NA NA
#3 7 Bob fourth 4 100 11 13 19 NA NA NA NA
#4 7 Bob fifth 5 100 22 15 11 NA NA NA NA
#5 7 Bob sixth 6 100 12 22 24 NA NA NA NA
#6 7 Bob seventh 7 100 24 14 10 NA NA NA NA
#7 7 Bob eighth 8 100 13 18 15 NA NA NA NA
#8 9 Bob_new first 1 100 NA NA NA 23 18 25 18
#9 9 Bob_new third 3 100 NA NA NA 14 19 10 21
#10 9 Bob_new fourth 4 100 NA NA NA 25 18 15 13
#11 9 Bob_new fifth 5 100 NA NA NA 11 25 20 18
#12 9 Bob_new sixth 6 100 NA NA NA 15 18 18 12
#13 9 Bob_new seventh 7 100 NA NA NA 20 11 23 19
#14 9 Bob_new eighth 8 100 NA NA NA 15 19 25 19
#15 10 Mark first 1 100 NA NA NA 20 18 19 16
Update
Based on the new dataset, perhaps this helps
library(dplyr)
library(tidyr)
# Keep rows of ids 7 and 9 only when the same (itemcode, type) pair occurs
# under both ids; rows of every other id pass through untouched.
# The pair key lives in a temporary column instead of going through a
# unite()/separate() round trip, so itemcode stays integer and type stays a
# factor, and a type value that contains punctuation cannot be split into
# extra pieces when the key is taken apart again.
dfnew %>%
  mutate(itemtype = paste(itemcode, type, sep = "_")) %>%
  filter(
    itemtype %in% intersect(itemtype[id == 7], itemtype[id == 9]) |
      !id %in% c(7, 9)
  ) %>%
  select(-itemtype)
# id client item itemcode type unit X2001 X2002 X2003 X2004 X2005 X2006
# 1 7 Bob first 1 A 100 24 17 23 NA NA NA
# 2 7 Bob third 3 B 100 30 12 46 43 97 11
# 3 7 Bob fourth 4 A 100 11 13 19 NA NA NA
# 4 7 Bob fifth 5 A 100 22 15 11 NA NA NA
# 5 7 Bob sixth 6 A 100 12 22 24 NA NA NA
# 6 7 Bob seventh 7 A 100 25 21 19 6 17 11
# 7 9 Bob_new first 1 A 100 NA NA NA 23 18 25
# 8 9 Bob_new third 3 B 100 NA NA NA 14 19 10
# 9 9 Bob_new fourth 4 A 100 NA NA NA 25 18 15
# 10 9 Bob_new fifth 5 A 100 NA NA NA 11 25 20
# 11 9 Bob_new sixth 6 A 100 NA NA NA 15 18 18
# 12 9 Bob_new seventh 7 A 100 NA NA NA 20 11 23
# 13 10 Mark first 1 A 100 NA NA NA 20 18 19
# X2007
#1 NA
#2 19
#3 NA
#4 NA
#5 NA
#6 15
#7 18
#8 21
#9 13
#10 18
#11 12
#12 19
#13 16
If I understand the problem: every itemcode in id=9 subset must have identical itemcode in id=7 subset (and reverse). If it is not the case then we filter the row with the non-pair itemcode out, but leave everything with id not in 7 or 9. Here is one way of doing it:
First get common item codes:
# Item codes recorded under id 7 and under id 9.
items_7 <- df_all[["itemcode"]][df_all[["id"]] == 7]
items_9 <- df_all[["itemcode"]][df_all[["id"]] == 9]
# Codes of id 9 that also occur under id 7 (order and repeats of id 9 kept).
shared_with_7 <- items_9 %in% items_7
items_common <- items_9[shared_with_7]
rm(shared_with_7)
select everything with common itemcodes for 7 and 9 and the rest:
# Rows of ids 7 and 9 survive only when their itemcode is one of the shared
# codes; rows of any other id are always kept. Both conditions are built from
# %in%, which never yields NA, so the logical mask can index directly.
df_new <- df_all[
  !(df_all$id %in% c(7, 9)) |
    (df_all$itemcode %in% items_common & df_all$id %in% c(7, 9)),
]
library(dplyr)
# Temporary key combining itemcode and type, so a row is matched on both.
df$remove <- paste(df$itemcode, df$type)
# Keep rows of ids 7 and 9 only when their key occurs under both ids; rows of
# any other id are kept as they are. The comparison is on `id` (the column
# that holds 7 and 9), not on `type`, whose values are the type labels.
df <- filter(
  df,
  remove %in% intersect(remove[id == 7], remove[id == 9]) |
    !id %in% c(7, 9)
)
# Remove the additional column after filter
df$remove <- NULL
You could do something like this, which runs setdiff in both directions. The cl() function wasn't really necessary, but I really don't like writing the same expression over and over again.
# f(x, y): the values that occur in y but not in x.
f <- function(x, y) {
  pooled <- union(x, y)
  setdiff(pooled, x)
}
# cl(var): build, without evaluating it, the call that extracts the item codes
# of the rows of df whose id equals var.
cl <- function(var) {
  substitute(df$itemcode[df$id == x], env = list(x = var))
}
So now you can call f() on c(id7, id9) and then reverse it and get the c(id9, id7) result.
# The two unevaluated extractions, id 7 first and id 9 second.
x <- list(cl(7), cl(9))
# Item codes found under id 9 but not under id 7.
do.call(f, x)
# [1] 2
# Same comparison with the arguments swapped: codes only under id 7.
do.call(f, rev(x))
# [1] 9
Related
I have 2 CSV files and I want to find the rows they have in common. After reading them as data frames, I converted them to data.tables and then merged them. But somehow my code is not working: after using setDT() my dataset is changed and I am not getting any common rows between them!
Before running my dataset
nodeA nodeB scr
1 ID08918 ID04896 1
2 ID00402 ID01198 1
3 ID00182 ID01576 1
4 ID06413 ID00745 1
5 ID00215 ID01175 1
6 ID00448 ID05351 1
7 ID00860 ID00959 0.996197718631179
8 ID01110 ID01127 0.99604743083004
9 ID00497 ID01192 0.995436766623207
10 ID00877 ID01590 0.993939393939394
11 ID01192 ID01183 0.992202729044834
12 ID00361 ID00570 0.988354430379747
13 ID01045 ID01201 0.98766954377312
14 ID11641 ID00541 0.986875315497224
15 ID11641 ID00570 0.98685540950455
16 ID00458 ID01151 0.986813186813187
17 ID00199 ID01211 0.981416957026713
18 ID00570 ID00309 0.981151299032094
19 ID00541 ID00309 0.978161503301168
20 ID00603 ID06789 0.977272727272727
library(dplyr)
# Read both edge lists, keeping text columns as character instead of factor.
df_1 <- read.csv("~/df_1.csv", stringsAsFactors = FALSE)
df_2 <- read.csv("~/df_2.csv", stringsAsFactors = FALSE)
library(data.table)
# setDT() turns each data frame into a data.table, and `:=` then overwrites
# nodeA/nodeB with the pairwise minimum/maximum of the two columns, so that
# every pair is stored in one fixed order.
# NOTE(review): the reproducible df_1 shown in this post has the columns
# query/target/new_ssp rather than nodeA/nodeB/scr -- confirm the names that
# the CSV actually contains.
setDT(df_1)[,c("nodeA", "nodeB") := list(pmin(nodeA,nodeB), pmax(nodeA,nodeB))]
setDT(df_2)[,c("nodeA", "nodeB") := list(pmin(nodeA,nodeB), pmax(nodeA,nodeB))]
# Drop fully duplicated rows of df_1, then merge with df_2. No `by` is given,
# so merge() chooses the join columns itself (presumably every shared column
# name, which would include scr -- TODO confirm).
result <- merge(df_1[!duplicated(df_1),], df_2, allow.cartesian=TRUE)
After running the code my dataset is changed.
nodeA nodeB scr
1: ID08918 ID08918 1
2: ID00402 ID00402 1
3: ID00182 ID00182 1
4: ID06413 ID06413 1
5: ID00215 ID00215 1
6: ID00448 ID00448 1
7: ID00860 ID00860 0.996197718631179
8: ID01110 ID01110 0.99604743083004
9: ID00497 ID00497 0.995436766623207
10: ID00877 ID00877 0.993939393939394
11: ID01192 ID01192 0.992202729044834
12: ID00361 ID00361 0.988354430379747
13: ID01045 ID01045 0.98766954377312
14: ID11641 ID11641 0.986875315497224
15: ID11641 ID11641 0.98685540950455
16: ID00458 ID00458 0.986813186813187
17: ID00199 ID00199 0.981416957026713
18: ID00570 ID00570 0.981151299032094
19: ID00541 ID00541 0.978161503301168
20: ID00603 ID00603 0.977272727272727
Reproducible Dataset
df_1
structure(list(query = structure(c(18L, 5L, 1L, 17L, 3L, 6L,
12L, 15L, 8L, 13L, 16L, 4L, 14L, 19L, 19L, 7L, 2L, 10L, 9L, 11L
), .Label = c("ID00182", "ID00199", "ID00215", "ID00361", "ID00402",
"ID00448", "ID00458", "ID00497", "ID00541", "ID00570", "ID00603",
"ID00860", "ID00877", "ID01045", "ID01110", "ID01192", "ID06413",
"ID08918", "ID11641"), class = "factor"), target = structure(c(16L,
11L, 14L, 4L, 8L, 17L, 5L, 6L, 10L, 15L, 9L, 3L, 12L, 2L, 3L,
7L, 13L, 1L, 1L, 18L), .Label = c("ID00309", "ID00541", "ID00570",
"ID00745", "ID00959", "ID01127", "ID01151", "ID01175", "ID01183",
"ID01192", "ID01198", "ID01201", "ID01211", "ID01576", "ID01590",
"ID04896", "ID05351", "ID06789"), class = "factor"), new_ssp = structure(c(15L,
15L, 15L, 15L, 15L, 15L, 14L, 13L, 12L, 11L, 10L, 9L, 8L, 7L,
6L, 5L, 4L, 3L, 2L, 1L), .Label = c("0.977272727272727", "0.978161503301168",
"0.981151299032094", "0.981416957026713", "0.986813186813187",
"0.98685540950455", "0.986875315497224", "0.98766954377312",
"0.988354430379747", "0.992202729044834", "0.993939393939394",
"0.995436766623207", "0.99604743083004", "0.996197718631179",
"1"), class = "factor")), class = "data.frame", row.names = c(NA,
-20L))
df_2
structure(list(nodeA = structure(c(4L, 2L, 1L, 1L, 1L, 4L, 1L,
9L, 3L, 4L, 2L, 8L, 2L, 1L, 5L, 7L, 3L, 6L, 2L, 1L), .Label = c("ID00309",
"ID00361", "ID00541", "ID00570", "ID00615", "ID00696", "ID00762",
"ID01200", "ID05109"), class = "factor"), nodeB = structure(c(8L,
3L, 3L, 1L, 2L, 7L, 9L, 8L, 8L, 6L, 9L, 7L, 4L, 4L, 6L, 9L, 6L,
7L, 5L, 5L), .Label = c("ID00361", "ID00541", "ID00570", "ID00615",
"ID00696", "ID01200", "ID05109", "ID11641", "ID11691"), class = "factor"),
scr = structure(20:1, .Label = c("1.85284606048794", "1.90444166064472",
"1.90762235378507", "1.94364188077133", "1.95883206119256",
"2.08440437841349", "2.26408172709962", "2.3223132020942",
"2.46120775935034", "2.49647215035727", "2.50432367561777",
"2.57541320006514", "2.65099330092281", "2.75209155741549",
"2.93717640337986", "2.99596628688011", "3.21209741517806",
"3.21997803385465", "3.48788394772132", "3.81389707587156"
), class = "factor")), class = "data.frame", row.names = c(NA,
-20L))
Note: I am also using dplyr for some purposes, such as %>%. Does that mean dplyr and data.table are conflicting somehow?
one possible solution with dplyr, inner_join and union from dplyr:
# Inner-join df_2 against df_1 in both orientations and combine the matches.
dplyr::union(
  # GROUP 1: nodeA/nodeB line up with query/target.
  df_2 %>%
    dplyr::inner_join(df_1, by = c(nodeA = "query", nodeB = "target")) %>%
    dplyr::mutate(GROUP = 1),
  # GROUP 2: the pair is stored the other way round in df_1.
  df_2 %>%
    dplyr::inner_join(df_1, by = c(nodeB = "query", nodeA = "target")) %>%
    dplyr::mutate(GROUP = 2)
)
nodeA nodeB scr new_ssp GROUP
1 ID00361 ID00570 3.48788394772132 0.988354430379747 1
2 ID00570 ID11641 3.81389707587156 0.98685540950455 2
3 ID00309 ID00570 3.21997803385465 0.981151299032094 2
4 ID00309 ID00541 2.99596628688011 0.978161503301168 2
5 ID00541 ID11641 2.57541320006514 0.986875315497224 2
I have many data frames - Here is a simplified version of two of them.
flows <- structure(list(Student = c("Adam", "Char", "Fred", "Greg", "Ed", "Mick", "Dave", "Nick", "Tim", "George", "Tom"),
Class = c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), Jan_18_score = c(NA, 5L, -7L, 2L, 1L, NA, 5L, 8L, -2L, 5L, NA),
Feb_18_score = c(2L, 0, 8L, NA, 2L, 6L, NA, 8L, 7L, 3L, 8L), Jan_18_Weight = c(150L, 30L, NA, 80L, 60L, 80L, 40L, 12L, 23L, 65L, 78L),
Feb_18_Weight = c(153L, 60L, 80L, 40L, 80L, 30L, 25L, 45L, 40L, NA, 50L)), class = "data.frame", row.names = c(NA, -11L))
returns <- structure(list(Student = c("Adam", "Char", "Fred", "Greg", "Ed", "Mick", "Dave", "Nick", "Tim", "George", "Tom"),
Class = c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), Jan_20_score = c(NA, 5L, -7L, 2L, 1L, NA, 5L, 8L, -2L, 5L, NA),
Feb_20_score = c(2L, 0, 8L, NA, 2L, 6L, NA, 8L, 7L, 3L, 8L), Jan_20_Weight = c(150L, 30L, NA, 80L, 60L, 80L, 40L, 12L, 23L, 65L, 78L),
Feb_20_Weight = c(153L, 60L, 80L, 40L, 80L, 30L, 25L, 45L, 40L, NA, 50L)), class = "data.frame", row.names = c(NA, -11L))
I am using lapply to remove some observations, I would like to do this across all my dataframes and keep the output as dataframes, basically update the existing dataframes and remove the observations I select.
Here is my current code.
# Collect the data frames in a list and, for each one, drop the rows whose
# Class matches "1". The value returned by lapply() is not assigned here.
# NOTE(review): grepl("1", ...) is a substring match on the text form of
# Class, so values such as 10 or 21 would match as well.
df.list <- list(flows, returns)
lapply(df.list, function(df) df[!grepl("1", df$Class),])
However, when I do this the output is not updating the original dataframes and is outputting as a list in the global environment. Any help is appreciated.
Another solution:
flows <- structure(list(Student = c("Adam", "Char", "Fred", "Greg", "Ed", "Mick", "Dave", "Nick", "Tim", "George", "Tom"),
Class = c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), Jan_18_score = c(NA, 5L, -7L, 2L, 1L, NA, 5L, 8L, -2L, 5L, NA),
Feb_18_score = c(2L, 0, 8L, NA, 2L, 6L, NA, 8L, 7L, 3L, 8L), Jan_18_Weight = c(150L, 30L, NA, 80L, 60L, 80L, 40L, 12L, 23L, 65L, 78L),
Feb_18_Weight = c(153L, 60L, 80L, 40L, 80L, 30L, 25L, 45L, 40L, NA, 50L)), class = "data.frame", row.names = c(NA, -11L))
returns <- structure(list(Student = c("Adam", "Char", "Fred", "Greg", "Ed", "Mick", "Dave", "Nick", "Tim", "George", "Tom"),
Class = c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), Jan_20_score = c(NA, 5L, -7L, 2L, 1L, NA, 5L, 8L, -2L, 5L, NA),
Feb_20_score = c(2L, 0, 8L, NA, 2L, 6L, NA, 8L, 7L, 3L, 8L), Jan_20_Weight = c(150L, 30L, NA, 80L, 60L, 80L, 40L, 12L, 23L, 65L, 78L),
Feb_20_Weight = c(153L, 60L, 80L, 40L, 80L, 30L, 25L, 45L, 40L, NA, 50L)), class = "data.frame", row.names = c(NA, -11L))
df.list <- list(flows, returns)
Now we need to assign the result of lapply to a variable and name its elements:
# Filter every data frame in df.list, dropping the rows whose Class is exactly
# 1. An exact match is used instead of grepl("1", ...), which would also drop
# classes such as 10, 11 or 21; rows with a missing Class are still kept.
# drop = FALSE keeps the result a data frame even if only one column exists.
a <- lapply(df.list, function(df) df[!(df$Class %in% 1L), , drop = FALSE])
# Name the elements after the original objects so that they can be written
# back under those names.
names(a) <- c("flows", "returns")
After this, we call list2env function:
# Copy each named element of `a` into the global environment under its name.
list2env(x = a, envir = globalenv())
Output:
> flows
Student Class Jan_18_score Feb_18_score Jan_18_Weight Feb_18_Weight
5 Ed 2 1 2 60 80
6 Mick 2 NA 6 80 30
7 Dave 3 5 NA 40 25
8 Nick 3 8 8 12 45
9 Tim 3 -2 7 23 40
10 George 3 5 3 65 NA
11 Tom 3 NA 8 78 50
> returns
Student Class Jan_20_score Feb_20_score Jan_20_Weight Feb_20_Weight
5 Ed 2 1 2 60 80
6 Mick 2 NA 6 80 30
7 Dave 3 5 NA 40 25
8 Nick 3 8 8 12 45
9 Tim 3 -2 7 23 40
10 George 3 5 3 65 NA
11 Tom 3 NA 8 78 50
Checking classes of the outputs:
> class(returns)
[1] "data.frame"
> class(flows)
[1] "data.frame"
I'm not sure about using lapply but you can work with lists of variables by name using get and assign.
flows <- structure(list(Student = c("Adam", "Char", "Fred", "Greg", "Ed", "Mick", "Dave", "Nick", "Tim", "George", "Tom"),
Class = c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), Jan_18_score = c(NA, 5L, -7L, 2L, 1L, NA, 5L, 8L, -2L, 5L, NA),
Feb_18_score = c(2L, 0, 8L, NA, 2L, 6L, NA, 8L, 7L, 3L, 8L), Jan_18_Weight = c(150L, 30L, NA, 80L, 60L, 80L, 40L, 12L, 23L, 65L, 78L),
Feb_18_Weight = c(153L, 60L, 80L, 40L, 80L, 30L, 25L, 45L, 40L, NA, 50L)), class = "data.frame", row.names = c(NA, -11L))
returns <- structure(list(Student = c("Adam", "Char", "Fred", "Greg", "Ed", "Mick", "Dave", "Nick", "Tim", "George", "Tom"),
Class = c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), Jan_20_score = c(NA, 5L, -7L, 2L, 1L, NA, 5L, 8L, -2L, 5L, NA),
Feb_20_score = c(2L, 0, 8L, NA, 2L, 6L, NA, 8L, 7L, 3L, 8L), Jan_20_Weight = c(150L, 30L, NA, 80L, 60L, 80L, 40L, 12L, 23L, 65L, 78L),
Feb_20_Weight = c(153L, 60L, 80L, 40L, 80L, 30L, 25L, 45L, 40L, NA, 50L)), class = "data.frame", row.names = c(NA, -11L))
# Names (not values) of the data frames to filter.
df.list <- list("flows", "returns")
for (df.name in df.list) {
  # Look the data frame up by its name.
  temp <- get(df.name)
  # Drop the rows whose Class is exactly 1. An exact match is used instead of
  # grepl("1", ...), which would also drop classes such as 10, 11 or 21; rows
  # with a missing Class are still kept.
  temp <- temp[!(temp$Class %in% 1L), , drop = FALSE]
  # Store the filtered copy under "<name>_new".
  assign(paste0(df.name, "_new"), temp)
}
Remove "_new" to overwrite the original variables.
I have a txt.file like this:
0003 MPARTNER SALZ S 150112 22:30:45 160304 08:38:13 2 BUY 2 BUY 12380 165426 150109 08:00:00
0003 SPROTTSE HUGHES S 140407 02:30:50 141120 13:55:06 2 BUY 2 BUY 3764 57379 140401 10:05:00
0003 SPROTTSE HUGHES S 141223 09:06:13 160715 08:42:56 3 MARKETPERFORM 3 HOLD 3764 57379 141223 08:02:00
001V MPARTNER PEARLSTEIN D 140821 02:44:05 150312 09:17:13 2 BUY 2 BUY 12380 163717 140820 08:16:00
001V MPARTNER PEARLSTEIN D 151016 15:07:40 160411 08:40:35 2 BUY 2 BUY 12380 163717 151009 08:12:00
001W CANACCOR K 140321 04:06:40 140609 23:06:44 SPECULATIVE BUY 1 STRONG BUY 406 150412 140319 23:19:00
001W CANACCOR WRIGHT K 140714 12:47:31 160228 22:57:45 BUY 1 STRONG BUY 406 150412 140714 12:38:00
001W CLARUS OFIR E 140515 11:40:00 150515 09:27:09 SPECULATIVE BUY 1 STRONG BUY 202 115944 140515 11:40:00
001W CLARUS MACKAY D 150813 09:40:45 160812 09:40:02 BUY 1 STRONG BUY 202 73763 150813 09:23:00
001W DEACON OFIR E 150119 22:03:46 170328 06:45:14 1 BUY 1 STRONG BUY 704 115944 150112 07:24:00
001W DEACON OFIR E 171115 06:48:47 171115 06:48:47 1 BUY 1 STRONG BUY 704 115944 171115 06:42:00
#70L MORGAN MARTINEZ J 100226 07:12:51 100708 04:51:16 8 EQUALWT/NO RATING 3 HOLD 1595 56947 100226 07:12:00
#70L MORGAN MARTINEZ DE O J 100708 05:09:02 100910 00:48:28 6 EQUALWT/IN-LINE 3 HOLD 1595 56947 100708 03:14:00
#70L MORGAN MARTINEZ DE O J 100910 21:16:07 101110 21:55:52 2 OVERWT/IN-LINE 2 BUY 1595 56947 100910 19:18:00
#70L MORGAN OLCOZ CERDAN J 101112 01:32:41 120618 21:04:56 2 OVERWT/IN-LINE 2 BUY 1595 56947 101111 20:03:00
#70L MORGAN OLCOZ CERDAN J 120712 03:19:26 131216 19:49:59 6 EQUALWT/IN-LINE 3 HOLD 1595 56947 120711 19:20:00
#70L MORGAN OLCOZ CERDAN J 140226 22:20:19 150417 13:07:31 2 OVERWT/IN-LINE 2 BUY 1595 56947 140226 22:20:00
#70L MORGAN J 150608 01:25:35 171106 00:16:05 1 OVERWT/ATTRACTIVE 2 BUY 1595 56947 150608 01:25:00
And I would like to produce a table in R with the same structure as in the txt file with the apparent 16 columns.
I tried to use the codes:
# Largest number of whitespace-separated fields found on any line of the file.
max(count.fields("BSP.txt", sep="")) # 18 columns
# Read the file splitting on runs of whitespace, with 18 explicit column
# names. Splitting on whitespace cannot keep multi-word or empty fields in
# their own column, which is why the resulting table looks shifted.
# NOTE(review): "VS" looks like a typo for "V2", but the dput() of the result
# shown after this call uses the same name.
df= read.delim("BSP.txt", sep = "" ,header = FALSE,col.names = c("V1", "VS","V3", "V4", "V5","V6",
"V7", "V8", "V9", "V10",
"V11", "V12", "V13", "V14",
"V15","V16","V17","V18"))
But I received a weirdly structured table:
structure(list(V1 = structure(c(2L, 2L, 2L, 3L, 3L, 4L, 4L, 4L,
4L, 4L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("#70L", "0003",
"001V", "001W"), class = "factor"), VS = structure(c(5L, 6L,
6L, 5L, 5L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L
), .Label = c("CANACCOR", "CLARUS", "DEACON", "MORGAN", "MPARTNER",
"SPROTTSE"), class = "factor"), V3 = structure(c(9L, 1L, 1L,
8L, 8L, 3L, 10L, 6L, 4L, 6L, 6L, 5L, 5L, 5L, 7L, 7L, 7L, 2L), .Label = c("HUGHES",
"J", "K", "MACKAY", "MARTINEZ", "OFIR", "OLCOZ", "PEARLSTEIN",
"SALZ", "WRIGHT"), class = "factor"), V4 = structure(c(9L, 9L,
9L, 4L, 4L, 1L, 8L, 6L, 4L, 6L, 6L, 7L, 5L, 5L, 3L, 3L, 3L, 2L
), .Label = c("140321", "150608", "CERDAN", "D", "DE", "E", "J",
"K", "S"), class = "factor"), V5 = structure(c(9L, 4L, 8L, 7L,
12L, 2L, 6L, 5L, 11L, 10L, 13L, 3L, 15L, 15L, 14L, 14L, 14L,
1L), .Label = c("01:25:35", "04:06:40", "100226", "140407", "140515",
"140714", "140821", "141223", "150112", "150119", "150813", "151016",
"171115", "J", "O"), class = "factor"), V6 = structure(c(16L,
1L, 5L, 2L, 13L, 12L, 9L, 8L, 6L, 15L, 3L, 4L, 17L, 17L, 7L,
10L, 11L, 14L), .Label = c("02:30:50", "02:44:05", "06:48:47",
"07:12:51", "09:06:13", "09:40:45", "101112", "11:40:00", "12:47:31",
"120712", "140226", "140609", "15:07:40", "171106", "22:03:46",
"22:30:45", "J"), class = "factor"), V7 = structure(c(10L, 6L,
12L, 7L, 11L, 17L, 9L, 8L, 13L, 14L, 15L, 4L, 4L, 5L, 2L, 3L,
16L, 1L), .Label = c("00:16:05", "01:32:41", "03:19:26", "100708",
"100910", "141120", "150312", "150515", "160228", "160304", "160411",
"160715", "160812", "170328", "171115", "22:20:19", "23:06:44"
), class = "factor"), V8 = structure(c(5L, 13L, 7L, 8L, 6L, 18L,
17L, 9L, 10L, 3L, 4L, 1L, 2L, 16L, 12L, 14L, 15L, 11L), .Label = c("04:51:16",
"05:09:02", "06:45:14", "06:48:47", "08:38:13", "08:40:35", "08:42:56",
"09:17:13", "09:27:09", "09:40:02", "1", "120618", "13:55:06",
"131216", "150417", "21:16:07", "22:57:45", "SPECULATIVE"), class = "factor"),
V9 = structure(c(6L, 6L, 8L, 6L, 6L, 10L, 10L, 12L, 10L,
1L, 1L, 9L, 2L, 3L, 7L, 5L, 4L, 11L), .Label = c("1", "100910",
"101110", "13:07:31", "19:49:59", "2", "21:04:56", "3", "8",
"BUY", "OVERWT/ATTRACTIVE", "SPECULATIVE"), class = "factor"),
V10 = structure(c(6L, 6L, 8L, 6L, 6L, 2L, 2L, 6L, 2L, 6L,
6L, 7L, 1L, 4L, 3L, 5L, 3L, 3L), .Label = c("00:48:28", "1",
"2", "21:55:52", "6", "BUY", "EQUALWT/NO", "MARKETPERFORM"
), class = "factor"), V11 = structure(c(2L, 2L, 3L, 2L, 2L,
9L, 9L, 1L, 9L, 1L, 1L, 8L, 4L, 2L, 7L, 6L, 7L, 5L), .Label = c("1",
"2", "3", "6", "BUY", "EQUALWT/IN-LINE", "OVERWT/IN-LINE",
"RATING", "STRONG"), class = "factor"), V12 = structure(c(4L,
4L, 6L, 4L, 4L, 4L, 4L, 8L, 4L, 8L, 8L, 3L, 5L, 7L, 2L, 3L,
2L, 1L), .Label = c("1595", "2", "3", "BUY", "EQUALWT/IN-LINE",
"HOLD", "OVERWT/IN-LINE", "STRONG"), class = "factor"), V13 = structure(c(1L,
5L, 5L, 1L, 1L, 6L, 6L, 8L, 3L, 8L, 8L, 9L, 4L, 2L, 8L, 9L,
8L, 7L), .Label = c("12380", "2", "202", "3", "3764", "406",
"56947", "BUY", "HOLD"), class = "factor"), V14 = structure(c(5L,
7L, 7L, 4L, 4L, 1L, 1L, 6L, 9L, 8L, 8L, 3L, 11L, 10L, 3L,
3L, 3L, 2L), .Label = c("150412", "150608", "1595", "163717",
"165426", "202", "57379", "704", "73763", "BUY", "HOLD"), class = "factor"),
V15 = structure(c(8L, 4L, 7L, 6L, 10L, 3L, 5L, 2L, 9L, 2L,
2L, 12L, 11L, 11L, 12L, 12L, 12L, 1L), .Label = c("01:25:00",
"115944", "140319", "140401", "140714", "140820", "141223",
"150109", "150813", "151009", "1595", "56947"), class = "factor"),
V16 = structure(c(2L, 7L, 3L, 5L, 4L, 16L, 10L, 13L, 6L,
14L, 15L, 8L, 17L, 17L, 9L, 11L, 12L, 1L), .Label = c("",
"08:00:00", "08:02:00", "08:12:00", "08:16:00", "09:23:00",
"10:05:00", "100226", "101111", "12:38:00", "120711", "140226",
"140515", "150112", "171115", "23:19:00", "56947"), class = "factor"),
V17 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 7L, 1L, 4L,
2L, 3L, 5L, 6L, 9L, 8L, 10L, 1L), .Label = c("", "06:42:00",
"07:12:00", "07:24:00", "100708", "100910", "11:40:00", "19:20:00",
"20:03:00", "22:20:00"), class = "factor"), V18 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 3L, 1L, 1L,
1L, 1L), .Label = c("", "03:14:00", "19:18:00"), class = "factor")), .Names = c("V1",
"VS", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11",
"V12", "V13", "V14", "V15", "V16", "V17", "V18"), class = "data.frame", row.names = c(NA,
-18L))
As stated above, I would like to receive a table with 16 columns with the structure in the txt.file. Even the empty fields (e.g. in Row 6) should remain.
E.g for Row 6:
Can you help me on this?
many thanks.
One option is to use read.fwf
# Read the file as fixed-width text: each entry of `widths` is the number of
# characters taken by one of the 16 columns.
# comment.char = "" is forwarded to read.table so that records beginning with
# "#" (such as the "#70L" rows) are read as data instead of being skipped as
# comments.
# NOTE(review): the question reads "BSP.txt"; point this at your own file.
df <- read.fwf("tst.txt", widths = c(8, 10, 14, 28, 7, 10, 7, 10, 7, 29, 3,
                                     21, 9, 8, 7, 8), header = FALSE,
               comment.char = "")
# Next, remove the leading/trailing whitespace from the text fields. Text
# columns arrive as factors on R < 4.0 and as character vectors from R 4.0
# on, so both kinds are trimmed; numeric columns are left alone.
library(dplyr)
df <- df %>%
  mutate_if(
    function(x) is.factor(x) || is.character(x),
    function(x) trimws(as.character(x))
  )
The data frame looks as:
df
# V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16
# 1 0003 MPARTNER SALZ S 150112 22:30:45 160304 08:38:13 2 BUY 2 BUY 12380 165426 150109 08:00:00
# 2 0003 SPROTTSE HUGHES S 140407 02:30:50 141120 13:55:06 2 BUY 2 BUY 3764 57379 140401 10:05:00
# 3 0003 SPROTTSE HUGHES S 141223 09:06:13 160715 08:42:56 3 MARKETPERFORM 3 HOLD 3764 57379 141223 08:02:00
# 4 001V MPARTNER PEARLSTEIN D 140821 02:44:05 150312 09:17:13 2 BUY 2 BUY 12380 163717 140820 08:16:00
# 5 001V MPARTNER PEARLSTEIN D 151016 15:07:40 160411 08:40:35 2 BUY 2 BUY 12380 163717 151009 08:12:00
# 6 001W CANACCOR K 140321 04:06:40 140609 23:06:44 NA SPECULATIVE BUY 1 STRONG BUY 406 150412 140319 23:19:00
# 7 001W CANACCOR WRIGHT K 140714 12:47:31 160228 22:57:45 NA BUY 1 STRONG BUY 406 150412 140714 12:38:00
# 8 001W CLARUS OFIR E 140515 11:40:00 150515 09:27:09 NA SPECULATIVE BUY 1 STRONG BUY 202 115944 140515 11:40:00
# 9 001W CLARUS MACKAY D 150813 09:40:45 160812 09:40:02 NA BUY 1 STRONG BUY 202 73763 150813 09:23:00
# 10 001W DEACON OFIR E 150119 22:03:46 170328 06:45:14 1 BUY 1 STRONG BUY 704 115944 150112 07:24:00
# 11 001W DEACON OFIR E 171115 06:48:47 171115 06:48:47 1 BUY 1 STRONG BUY 704 115944 171115 06:42:00
# 12 #70L MORGAN MARTINEZ J 100226 07:12:51 100708 04:51:16 8 EQUALWT/NO RATING 3 HOLD 1595 56947 100226 07:12:00
# 13 #70L MORGAN MARTINEZ DE O J 100708 05:09:02 100910 00:48:28 6 EQUALWT/IN-LINE 3 HOLD 1595 56947 100708 03:14:00
# 14 #70L MORGAN MARTINEZ DE O J 100910 21:16:07 101110 21:55:52 2 OVERWT/IN-LINE 2 BUY 1595 56947 100910 19:18:00
# 15 #70L MORGAN OLCOZ CERDAN J 101112 01:32:41 120618 21:04:56 2 OVERWT/IN-LINE 2 BUY 1595 56947 101111 20:03:00
# 16 #70L MORGAN OLCOZ CERDAN J 120712 03:19:26 131216 19:49:59 6 EQUALWT/IN-LINE 3 HOLD 1595 56947 120711 19:20:00
# 17 #70L MORGAN OLCOZ CERDAN J 140226 22:20:19 150417 13:07:31 2 OVERWT/IN-LINE 2 BUY 1595 56947 140226 22:20:00
# 18 #70L MORGAN J 150608 01:25:35 171106 00:16:05 1 OVERWT/ATTRACTIVE 2 BUY 1595 56947 150608 01:25:00
The above data.frame has 16 columns and 18 rows.
I am relatively new to R, and I need help with a user-defined function. I would like to see where each observation of a data frame ranks within a subset of similar observations from the same data frame. I'm having trouble referencing the original observation within my function in order to extract its rank.
Here is a sample of my data:
> dput(df)
structure(list(Name = c("Alex Abrines", "Steven Adams", "Cole Aldrich",
"LaMarcus Aldridge", "Kyle Anderson", "Ryan Anderson", "Giannis Antetokounmpo",
"Carmelo Anthony", "OG Anunoby", "Darrell Arthur", "Will Barton",
"Bradley Beal", "Davis Bertans", "Nemanja Bjelica", "Malcolm Brogdon",
"Aaron Brooks", "Dillon Brooks", "Lorenzo Brown", "Sterling Brown",
"Reggie Bullock", "Jimmy Butler", "Dwight Buycks", "Clint Capela",
"Wilson Chandler", "Torrey Craig", "Jamal Crawford", "Deyonta Davis",
"Matthew Dellavedova", "DeMar DeRozan", "Gorgui Dieng", "Andre Drummond",
"James Ennis", "Kenneth Faried", "Raymond Felton", "Terrance Ferguson",
"Bryn Forbes", "Tim Frazier", "Langston Galloway", "Marc Gasol",
"Pau Gasol", "Paul George", "Marcus Georges-Hunt", "Taj Gibson",
"Manu Ginobili", "Marcin Gortat", "Jerami Grant", "Danny Green",
"Gerald Green", "JaMychal Green", "Blake Griffin", "James Harden",
"Gary Harris", "Andrew Harrison", "Myke Henry", "John Henson",
"Nene Hilario", "Darrun Hilliard", "Josh Huestis", "Serge Ibaka",
"Stanley Johnson", "Nikola Jokic", "Tyus Jones", "Luke Kennard",
"Sean Kilpatrick", "Joffrey Lauvergne", "Kyle Lowry", "Trey Lyles",
"Ian Mahinmi", "Thon Maker", "Jarell Martin", "Luc Mbah a Moute",
"Ben McLemore", "Jodie Meeks", "Khris Middleton", "Patty Mills",
"Eric Moreland", "Markieff Morris", "Emmanuel Mudiay", "Shabazz Muhammad",
"Xavier Munford", "Dejounte Murray", "Jamal Murray", "Lucas Nogueira",
"Kelly Oubre", "Tony Parker", "Patrick Patterson", "Brandon Paul",
"Chris Paul", "Marshall Plumlee", "Jakob Poeltl", "Otto Porter",
"Norman Powell", "Willie Reed", "Tomas Satoransky", "Mike Scott",
"Wayne Selden", "Pascal Siakam", "Ish Smith", "Tony Snell", "Jeff Teague",
"Anthony Tolliver", "Karl-Anthony Towns", "P.J. Tucker", "Jonas Valanciunas",
"Rashad Vaughn", "Russell Westbrook", "Andrew Wiggins", "D.J. Wilson",
"Delon Wright"), Pos = structure(c(5L, 1L, 1L, 1L, 3L, 2L, 3L,
2L, 2L, 2L, 4L, 4L, 2L, 2L, 4L, 4L, 5L, 4L, 4L, 5L, 3L, 4L, 1L,
2L, 5L, 4L, 1L, 4L, 5L, 1L, 1L, 2L, 2L, 4L, 5L, 4L, 4L, 4L, 1L,
1L, 2L, 4L, 2L, 4L, 1L, 2L, 5L, 5L, 2L, 2L, 4L, 4L, 4L, 2L, 1L,
1L, 4L, 2L, 1L, 2L, 1L, 4L, 4L, 4L, 1L, 4L, 2L, 1L, 1L, 2L, 2L,
4L, 4L, 3L, 4L, 1L, 2L, 4L, 3L, 4L, 4L, 4L, 1L, 2L, 4L, 2L, 4L,
4L, 1L, 1L, 2L, 4L, 1L, 4L, 2L, 5L, 2L, 4L, 5L, 4L, 1L, 1L, 2L,
1L, 4L, 4L, 3L, 2L, 4L), .Label = c("C", "PF", "SF", "PG", "SG"
), class = "factor"), Date = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "2018-02-01 *", class = "factor"),
Tm = structure(c(7L, 7L, 6L, 8L, 8L, 3L, 5L, 7L, 9L, 1L,
1L, 10L, 8L, 6L, 5L, 6L, 4L, 9L, 5L, 2L, 6L, 2L, 3L, 1L,
1L, 6L, 4L, 5L, 9L, 6L, 2L, 4L, 1L, 7L, 7L, 8L, 10L, 2L,
4L, 8L, 7L, 6L, 6L, 8L, 10L, 7L, 8L, 3L, 4L, 2L, 3L, 1L,
4L, 4L, 5L, 3L, 8L, 7L, 9L, 2L, 1L, 6L, 2L, 5L, 8L, 9L, 1L,
10L, 5L, 4L, 3L, 4L, 10L, 5L, 8L, 2L, 10L, 1L, 6L, 5L, 8L,
1L, 9L, 10L, 8L, 7L, 8L, 3L, 5L, 9L, 10L, 9L, 2L, 10L, 10L,
4L, 9L, 2L, 5L, 6L, 2L, 6L, 3L, 9L, 5L, 7L, 6L, 5L, 9L), .Label = c("DEN",
"DET", "HOU", "MEM", "MIL", "MIN", "OKC", "SAS", "TOR", "WAS"
), class = "factor"), Opp = structure(c(1L, 1L, 5L, 3L, 3L,
8L, 6L, 1L, 10L, 7L, 7L, 9L, 3L, 5L, 6L, 5L, 2L, 10L, 6L,
4L, 5L, 4L, 8L, 7L, 7L, 5L, 2L, 6L, 10L, 5L, 4L, 2L, 7L,
1L, 1L, 3L, 9L, 4L, 2L, 3L, 1L, 5L, 5L, 3L, 9L, 1L, 3L, 8L,
2L, 4L, 8L, 7L, 2L, 2L, 6L, 8L, 3L, 1L, 10L, 4L, 7L, 5L,
4L, 6L, 3L, 10L, 7L, 9L, 6L, 2L, 8L, 2L, 9L, 6L, 3L, 4L,
9L, 7L, 5L, 6L, 3L, 7L, 10L, 9L, 3L, 1L, 3L, 8L, 6L, 10L,
9L, 10L, 4L, 9L, 9L, 2L, 10L, 4L, 6L, 5L, 4L, 5L, 8L, 10L,
6L, 1L, 5L, 6L, 10L), .Label = c("DEN", "DET", "HOU", "MEM",
"MIL", "MIN", "OKC", "SAS", "TOR", "WAS"), class = "factor"),
MP = c(29L, 32L, 3L, 34L, 30L, 29L, 36L, 34L, 21L, 1L, 36L,
38L, 13L, 14L, 10L, 3L, 32L, 11L, 24L, 35L, 40L, 19L, 35L,
34L, 22L, 17L, 15L, 25L, 38L, 13L, 28L, 15L, 10L, 14L, 4L,
18L, 17L, 4L, 33L, 20L, 36L, 6L, 33L, 20L, 26L, 25L, 28L,
30L, 20L, 35L, 37L, 38L, 34L, 22L, 32L, 13L, 8L, 12L, 35L,
36L, 37L, 17L, 21L, 18L, 2L, 35L, 15L, 19L, 13L, 28L, 35L,
10L, 9L, 35L, 24L, 5L, 32L, 14L, 3L, 7L, 24L, 34L, 3L, 23L,
17L, 15L, 2L, 30L, 5L, 16L, 29L, 26L, 5L, 28L, 19L, 31L,
13L, 29L, 29L, 28L, 22L, 33L, 31L, 29L, 4L, 39L, 30L, 4L,
13L), Player.ID = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L,
20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L,
32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 42L, 41L, 43L,
44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 53L, 52L, 54L, 55L,
56L, 57L, 58L, 59L, 60L, 61L, 62L, 63L, 64L, 65L, 66L, 67L,
68L, 69L, 70L, 71L, 72L, 73L, 74L, 75L, 76L, 77L, 78L, 79L,
80L, 81L, 82L, 83L, 84L, 85L, 86L, 87L, 88L, 89L, 90L, 91L,
92L, 93L, 94L, 95L, 96L, 97L, 98L, 99L, 100L, 101L, 102L,
103L, 104L, 105L, 106L, 107L, 108L, 109L), .Label = c("abrinal01",
"adamsst01", "aldrico01", "aldrila01", "anderky01", "anderry01",
"antetgi01", "anthoca01", "anunoog01", "arthuda01", "bartowi01",
"bealbr01", "bertada01", "bjeline01", "brogdma01", "brookaa01",
"brookdi01", "brownlo01", "brownst02", "bullore01", "butleji01",
"buyckdw01", "capelca01", "chandwi01", "craigto01", "crawfja01",
"davisde01", "dellama01", "derozde01", "dienggo01", "drumman01",
"ennisja01", "farieke01", "feltora01", "fergute01", "forbebr01",
"fraziti01", "gallola01", "gasolma01", "gasolpa01", "georgma01",
"georgpa01", "gibsota01", "ginobma01", "gortama01", "grantje01",
"greenda02", "greenge01", "greenja01", "griffbl01", "hardeja01",
"harrian01", "harriga01", "henrymy01", "hensojo01", "hilarne01",
"hillida01", "huestjo01", "ibakase01", "johnsst04", "jokicni01",
"jonesty01", "kennalu01", "kilpase01", "lauvejo01", "lowryky01",
"lylestr01", "mahinia01", "makerth01", "martija01", "mbahalu01",
"mclembe01", "meeksjo01", "middlkh01", "millspa02", "moreler01",
"morrima02", "mudiaem01", "muhamsh01", "munfoxa02", "murrade01",
"murraja01", "noguelu01", "oubreke01", "parketo01", "pattepa01",
"paulbr01", "paulch01", "plumlma02", "poeltja01", "porteot01",
"powelno01", "reedwi02", "satorto01", "scottmi01", "seldewa01",
"siakapa01", "smithis01", "snellto01", "teaguje01", "tollian01",
"townska01", "tuckepj01", "valanjo01", "vaughra01", "westbru01",
"wiggian01", "wilsodj01", "wrighde01"), class = "factor"),
Game.ID = structure(c(7L, 7L, 6L, 8L, 8L, 3L, 5L, 7L, 9L,
1L, 1L, 10L, 8L, 6L, 5L, 6L, 4L, 9L, 5L, 2L, 6L, 2L, 3L,
1L, 1L, 6L, 4L, 5L, 9L, 6L, 2L, 4L, 1L, 7L, 7L, 8L, 10L,
2L, 4L, 8L, 7L, 6L, 6L, 8L, 10L, 7L, 8L, 3L, 4L, 2L, 3L,
1L, 4L, 4L, 5L, 3L, 8L, 7L, 9L, 2L, 1L, 6L, 2L, 5L, 8L, 9L,
1L, 10L, 5L, 4L, 3L, 4L, 10L, 5L, 8L, 2L, 10L, 1L, 6L, 5L,
8L, 1L, 9L, 10L, 8L, 7L, 8L, 3L, 5L, 9L, 10L, 9L, 2L, 10L,
10L, 4L, 9L, 2L, 5L, 6L, 2L, 6L, 3L, 9L, 5L, 7L, 6L, 5L,
9L), .Label = c("2018-02-01 * DEN", "2018-02-01 * DET", "2018-02-01 * HOU",
"2018-02-01 * MEM", "2018-02-01 * MIL", "2018-02-01 * MIN",
"2018-02-01 * OKC", "2018-02-01 * SAS", "2018-02-01 * TOR",
"2018-02-01 * WAS"), class = "factor")), .Names = c("Name",
"Pos", "Date", "Tm", "Opp", "MP", "Player.ID", "Game.ID"), class = "data.frame", row.names = c(NA,
109L))
I would like to write a function that, for each observation:
> df[1, ]
Name Pos Date Tm Opp MP Player.ID Game.ID
1 Alex Abrines SG 2018-02-01 * OKC DEN 29 abrinal01 2018-02-01 * OKC
creates a subset of all other observations with a matching df$Game.ID.
> df[df$Game.ID == '2018-02-01 * OKC', ]
Name Pos Date Tm Opp MP Player.ID Game.ID
1 Alex Abrines SG 2018-02-01 * OKC DEN 29 abrinal01 2018-02-01 * OKC
2 Steven Adams C 2018-02-01 * OKC DEN 32 adamsst01 2018-02-01 * OKC
8 Carmelo Anthony PF 2018-02-01 * OKC DEN 34 anthoca01 2018-02-01 * OKC
34 Raymond Felton PG 2018-02-01 * OKC DEN 14 feltora01 2018-02-01 * OKC
35 Terrance Ferguson SG 2018-02-01 * OKC DEN 4 fergute01 2018-02-01 * OKC
41 Paul George PF 2018-02-01 * OKC DEN 36 georgpa01 2018-02-01 * OKC
46 Jerami Grant PF 2018-02-01 * OKC DEN 25 grantje01 2018-02-01 * OKC
58 Josh Huestis PF 2018-02-01 * OKC DEN 12 huestjo01 2018-02-01 * OKC
86 Patrick Patterson PF 2018-02-01 * OKC DEN 15 pattepa01 2018-02-01 * OKC
106 Russell Westbrook PG 2018-02-01 * OKC DEN 39 westbru01 2018-02-01 * OKC
and then returns the rank of the original observation's df$MP
> df[1, c('MP')]
[1] 29
in the hierarchy of the new subset.
> xx <- data.frame(cbind(sort(df[df$Game.ID == '2018-02-01 * OKC', c('MP')], decreasing = TRUE), rownames(data.table(sort(df[df$Game.ID == '2018-02-01 * OKC', c('MP')], decreasing = TRUE)))))
> xx
X1 X2
1 39 1
2 36 2
3 34 3
4 32 4
5 29 5
6 25 6
7 15 7
8 14 8
9 12 9
10 4 10
> colnames(xx) <- c('MP', 'Depth.Chart')
> yy <- df[df$Game.ID == '2018-02-01 * OKC', ]
> yy
Name Pos Date Tm Opp MP Player.ID
1 Alex Abrines SG 2018-02-01 * OKC DEN 29 abrinal01
2 Steven Adams C 2018-02-01 * OKC DEN 32 adamsst01
8 Carmelo Anthony PF 2018-02-01 * OKC DEN 34 anthoca01
34 Raymond Felton PG 2018-02-01 * OKC DEN 14 feltora01
35 Terrance Ferguson SG 2018-02-01 * OKC DEN 4 fergute01
41 Paul George PF 2018-02-01 * OKC DEN 36 georgpa01
46 Jerami Grant PF 2018-02-01 * OKC DEN 25 grantje01
58 Josh Huestis PF 2018-02-01 * OKC DEN 12 huestjo01
86 Patrick Patterson PF 2018-02-01 * OKC DEN 15 pattepa01
106 Russell Westbrook PG 2018-02-01 * OKC DEN 39 westbru01
Game.ID
1 2018-02-01 * OKC
2 2018-02-01 * OKC
8 2018-02-01 * OKC
34 2018-02-01 * OKC
35 2018-02-01 * OKC
41 2018-02-01 * OKC
46 2018-02-01 * OKC
58 2018-02-01 * OKC
86 2018-02-01 * OKC
106 2018-02-01 * OKC
> zz <- merge(yy, xx, all.x = TRUE)
> zz
MP Name Pos Date Tm Opp Player.ID
1 4 Terrance Ferguson SG 2018-02-01 * OKC DEN fergute01
2 12 Josh Huestis PF 2018-02-01 * OKC DEN huestjo01
3 14 Raymond Felton PG 2018-02-01 * OKC DEN feltora01
4 15 Patrick Patterson PF 2018-02-01 * OKC DEN pattepa01
5 25 Jerami Grant PF 2018-02-01 * OKC DEN grantje01
6 29 Alex Abrines SG 2018-02-01 * OKC DEN abrinal01
7 32 Steven Adams C 2018-02-01 * OKC DEN adamsst01
8 34 Carmelo Anthony PF 2018-02-01 * OKC DEN anthoca01
9 36 Paul George PF 2018-02-01 * OKC DEN georgpa01
10 39 Russell Westbrook PG 2018-02-01 * OKC DEN westbru01
Game.ID Depth.Chart
1 2018-02-01 * OKC 10
2 2018-02-01 * OKC 9
3 2018-02-01 * OKC 8
4 2018-02-01 * OKC 7
5 2018-02-01 * OKC 6
6 2018-02-01 * OKC 5
7 2018-02-01 * OKC 4
8 2018-02-01 * OKC 3
9 2018-02-01 * OKC 2
10 2018-02-01 * OKC 1
Finally, I need to extract the value of zz$Depth.Chart that corresponds to the original observation, 5.
> zz[zz$MP == 29, c('Depth.Chart')]
[1] 5
Levels: 1 10 2 3 4 5 6 7 8 9
I would like to define a function that executes the laborious and messy steps above for each observation in a data frame and returns a vector of the results. How can I reference the value of df$MP that corresponds to the observation I'm working on without explicitly calling it 29, like I do above? Here are a few of the things I've tried, unsuccessfully.
> f1 <- function(col1, df, col2){
+ lapply(col1, function(i){
+ df2 <- df[col1 == i, col2]
+ df3 <- data.frame(cbind(sort(df2, decreasing = TRUE), rownames(data.table(sort(df2, decreasing = TRUE)))))
+ df3[i, 2]
+ })}
> f1(df$Game.ID, df, c('MP'))[1:10]
[[1]]
[1] 7
Levels: 1 10 2 3 4 5 6 7 8 9
[[2]]
[1] 7
Levels: 1 10 2 3 4 5 6 7 8 9
[[3]]
[1] 6
Levels: 1 10 11 12 13 2 3 4 5 6 7 8 9
[[4]]
[1] 8
Levels: 1 10 11 12 13 2 3 4 5 6 7 8 9
[[5]]
[1] 8
Levels: 1 10 11 12 13 2 3 4 5 6 7 8 9
[[6]]
[1] 3
Levels: 1 2 3 4 5 6 7 8
[[7]]
[1] 5
Levels: 1 10 11 12 13 2 3 4 5 6 7 8 9
[[8]]
[1] 7
Levels: 1 10 2 3 4 5 6 7 8 9
[[9]]
[1] 9
Levels: 1 10 11 2 3 4 5 6 7 8 9
[[10]]
[1] 1
Levels: 1 10 2 3 4 5 6 7 8 9
> f1 <- function(col1, df, col2){
+ lapply(col1, function(i){
+ df2 <- df[col1 == i, col2]
+ df3 <- data.frame(cbind(sort(df2, decreasing = TRUE), rownames(data.table(sort(df2, decreasing = TRUE)))))
+ df3[df3$X1 == i, 2]
+ })}
> f1(df$Game.ID, df, c('MP'))
Hide Traceback
Rerun with Debug
Error in Ops.factor(df3$X1, i) : level sets of factors are different
7.
stop("level sets of factors are different")
6.
Ops.factor(df3$X1, i)
5.
`[.data.frame`(df3, df3$X1 == i, 2)
4.
df3[df3$X1 == i, 2]
3.
FUN(X[[i]], ...)
2.
lapply(col1, function(i) {
df2 <- df[col1 == i, col2]
df3 <- data.frame(cbind(sort(df2, decreasing = TRUE), rownames(data.table(sort(df2,
decreasing = TRUE))))) ...
1.
f1(df$Game.ID, df, c("MP"))
> f1 <- function(col1, df, col2){
+ lapply(col1, function(i){
+ df2 <- df[col1 == i, col2]
+ df3 <- data.frame(cbind(sort(df2, decreasing = TRUE), rownames(data.table(sort(df2, decreasing = TRUE)))))
+ df3[col2 == i, 2]
+ })}
> f1(df$Game.ID, df, c('MP'))[1:10]
[[1]]
factor(0)
Levels: 1 10 2 3 4 5 6 7 8 9
[[2]]
factor(0)
Levels: 1 10 2 3 4 5 6 7 8 9
[[3]]
factor(0)
Levels: 1 10 11 12 13 2 3 4 5 6 7 8 9
[[4]]
factor(0)
Levels: 1 10 11 12 13 2 3 4 5 6 7 8 9
[[5]]
factor(0)
Levels: 1 10 11 12 13 2 3 4 5 6 7 8 9
[[6]]
factor(0)
Levels: 1 2 3 4 5 6 7 8
[[7]]
factor(0)
Levels: 1 10 11 12 13 2 3 4 5 6 7 8 9
[[8]]
factor(0)
Levels: 1 10 2 3 4 5 6 7 8 9
[[9]]
factor(0)
Levels: 1 10 11 2 3 4 5 6 7 8 9
[[10]]
factor(0)
Levels: 1 10 2 3 4 5 6 7 8 9
I guess I don't fully understand how R treats this i variable inside the function, or, therefore, how to reference it appropriately. In looking through this forum, I found generic examples of nesting functions inside of functions in Python but not in R. Any help would be much appreciated.
EDIT
Here is a simpler subset of my data:
> dput(df)
structure(list(MP = c(29L, 32L, 3L, 34L, 14L, 3L, 40L, 17L, 13L,
14L, 4L, 36L, 6L, 33L, 25L, 12L, 17L, 3L, 15L, 28L, 33L, 39L,
30L), Player.ID = structure(c(1L, 2L, 3L, 8L, 14L, 16L, 21L,
26L, 30L, 34L, 35L, 42L, 41L, 43L, 46L, 58L, 62L, 79L, 86L, 100L,
102L, 106L, 107L), .Label = c("abrinal01", "adamsst01", "aldrico01",
"aldrila01", "anderky01", "anderry01", "antetgi01", "anthoca01",
"anunoog01", "arthuda01", "bartowi01", "bealbr01", "bertada01",
"bjeline01", "brogdma01", "brookaa01", "brookdi01", "brownlo01",
"brownst02", "bullore01", "butleji01", "buyckdw01", "capelca01",
"chandwi01", "craigto01", "crawfja01", "davisde01", "dellama01",
"derozde01", "dienggo01", "drumman01", "ennisja01", "farieke01",
"feltora01", "fergute01", "forbebr01", "fraziti01", "gallola01",
"gasolma01", "gasolpa01", "georgma01", "georgpa01", "gibsota01",
"ginobma01", "gortama01", "grantje01", "greenda02", "greenge01",
"greenja01", "griffbl01", "hardeja01", "harrian01", "harriga01",
"henrymy01", "hensojo01", "hilarne01", "hillida01", "huestjo01",
"ibakase01", "johnsst04", "jokicni01", "jonesty01", "kennalu01",
"kilpase01", "lauvejo01", "lowryky01", "lylestr01", "mahinia01",
"makerth01", "martija01", "mbahalu01", "mclembe01", "meeksjo01",
"middlkh01", "millspa02", "moreler01", "morrima02", "mudiaem01",
"muhamsh01", "munfoxa02", "murrade01", "murraja01", "noguelu01",
"oubreke01", "parketo01", "pattepa01", "paulbr01", "paulch01",
"plumlma02", "poeltja01", "porteot01", "powelno01", "reedwi02",
"satorto01", "scottmi01", "seldewa01", "siakapa01", "smithis01",
"snellto01", "teaguje01", "tollian01", "townska01", "tuckepj01",
"valanjo01", "vaughra01", "westbru01", "wiggian01", "wilsodj01",
"wrighde01"), class = "factor"), Game.ID = structure(c(7L, 7L,
6L, 7L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 6L, 6L, 7L, 7L, 6L, 6L,
7L, 6L, 6L, 7L, 6L), .Label = c("2018-02-01 * DEN", "2018-02-01 * DET",
"2018-02-01 * HOU", "2018-02-01 * MEM", "2018-02-01 * MIL", "2018-02-01 * MIN",
"2018-02-01 * OKC", "2018-02-01 * SAS", "2018-02-01 * TOR", "2018-02-01 * WAS"
), class = "factor")), .Names = c("MP", "Player.ID", "Game.ID"
), row.names = c(1L, 2L, 3L, 8L, 14L, 16L, 21L, 26L, 30L, 34L,
35L, 41L, 42L, 43L, 46L, 58L, 62L, 79L, 86L, 100L, 102L, 106L,
107L), class = "data.frame")
You're using data.table for little steps in your process, but you should just use it for the whole thing. It's very convenient for doing operations "by group", in this case using rank() by Game.ID. Using your small sample data:
library(data.table)
# Convert df to a data.table by reference (no copy is made).
setDT(df)
# Within each game, rank players by minutes played so that the most minutes
# gets rank 1. Ties share the average rank, which is rank()'s default and is
# only spelled out here for clarity.
df[, Depth.Chart := rank(-MP, ties.method = "average"), by = .(Game.ID)]
df
# MP Player.ID Game.ID Depth.Chart
# 1: 29 abrinal01 2018-02-01 * OKC 5.0
# 2: 32 adamsst01 2018-02-01 * OKC 4.0
# 3: 3 aldrico01 2018-02-01 * MIN 12.0
# 4: 34 anthoca01 2018-02-01 * OKC 3.0
# 5: 14 bjeline01 2018-02-01 * MIN 8.0
# 6: 3 brookaa01 2018-02-01 * MIN 12.0
# 7: 40 butleji01 2018-02-01 * MIN 1.0
# 8: 17 crawfja01 2018-02-01 * MIN 6.5
# 9: 13 dienggo01 2018-02-01 * MIN 9.0
# 10: 14 feltora01 2018-02-01 * OKC 8.0
# 11: 4 fergute01 2018-02-01 * OKC 10.0
# 12: 36 georgpa01 2018-02-01 * OKC 2.0
# 13: 6 georgma01 2018-02-01 * MIN 10.0
# 14: 33 gibsota01 2018-02-01 * MIN 2.5
# 15: 25 grantje01 2018-02-01 * OKC 6.0
# 16: 12 huestjo01 2018-02-01 * OKC 9.0
# 17: 17 jonesty01 2018-02-01 * MIN 6.5
# 18: 3 muhamsh01 2018-02-01 * MIN 12.0
# 19: 15 pattepa01 2018-02-01 * OKC 7.0
# 20: 28 teaguje01 2018-02-01 * MIN 5.0
# 21: 33 townska01 2018-02-01 * MIN 2.5
# 22: 39 westbru01 2018-02-01 * OKC 1.0
# 23: 30 wiggian01 2018-02-01 * MIN 4.0
# MP Player.ID Game.ID Depth.Chart
rank, by default, averages ties, but see ?rank for other options.
I've run a few experiments, and each experiment led to the appearance of a color.
As I can't do more experiments, I want to draw samples of size 30 and see what frequency table (of colors) I would obtain over 1000 samplings. The resulting frequency table should be the sum of the 1000 individual frequency tables.
I thought about concatenating the tables as follows and then aggregating them, but it did not work:
mydata=structure(list(Date = structure(c(11L, 1L, 9L, 9L, 10L, 1L, 2L,
3L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 6L, 7L, 4L, 4L, 4L, 6L, 6L, 11L,
5L, 4L, 7L, 10L, 6L, 6L, 2L, 5L, 7L, 11L, 1L, 9L, 11L, 11L, 11L,
1L, 1L), .Label = c("01/02/2016", "02/02/2016", "03/02/2016",
"08/02/2016", "10/02/2016", "11/02/2016", "16/02/2016", "22/02/2016",
"26/01/2016", "27/01/2016", "28/01/2016"), class = "factor"),
Color = structure(c(30L, 33L, 11L, 1L, 18L, 18L, 11L,
16L, 19L, 19L, 22L, 1L, 18L, 18L, 13L, 14L, 13L, 18L, 24L,
24L, 11L, 24L, 2L, 33L, 25L, 1L, 30L, 5L, 24L, 18L, 13L,
35L, 19L, 19L, 18L, 23L, 19L, 8L, 19L, 14L), .Label = c("ARD",
"ARP", "BBB", "BIE", "CFX", "CHR", "DDD", "DOO", "EAU", "ELY",
"EPI", "ETR", "GEN", "GER", "GGG", "GIS", "ISE", "JUV", "LER",
"LES", "LON", "LYR", "MON", "NER", "NGY", "NOJ", "NYO", "ORI",
"PEO", "RAY", "RRR", "RSI", "SEI", "SEP", "VIL", "XQU", "YYY",
"ZYZ"), class = "factor"), Categorie = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1", "1,2", "1,2,3",
"1,3", "2", "2,3", "3", "4", "5"), class = "factor"), Portion_Longueur = c(3L,
4L, 1L, 1L, 2L, 4L, 5L, 6L, 7L, 7L, 8L, 8L, 9L, 8L, 8L, 9L,
11L, 7L, 7L, 7L, 9L, 8L, 3L, 8L, 7L, 11L, 2L, 9L, 8L, 5L,
8L, 12L, 3L, 4L, 1L, 3L, 3L, 3L, 4L, 5L)), .Names = c("Date",
"Color", "Categorie", "Portion_Longueur"), row.names = c(NA,
40L), class = "data.frame")
for (i in 1:1000) {
mysamp= sample(mydata$Color,size=30)
x=data.frame(table(mysamp))
if (i==1) w=x
else w <- c(w, x)
}
aggregate(w$Freq, by=list(Color=w$mysamp), FUN=sum)
For example, for 3 samplings (for (i in 1:3)), I expect to get the sum as follows:
But I do not get the sum; instead I get:
Color x
1 ARD 2
2 ARP 1
3 BBB 0
4 BIE 0
5 CFX 0
6 CHR 0
7 DDD 0
8 DOO 1
9 EAU 0
10 ELY 0
11 EPI 3
12 ETR 0
13 GEN 2
14 GER 2
15 GGG 0
16 GIS 1
17 ISE 0
18 JUV 4
19 LER 5
20 LES 0
21 LON 0
22 LYR 1
23 MON 1
24 NER 2
25 NGY 1
26 NOJ 0
27 NYO 0
28 ORI 0
29 PEO 0
30 RAY 1
31 RRR 0
32 RSI 0
33 SEI 2
34 SEP 0
35 VIL 1
36 XQU 0
37 YYY 0
38 ZYZ 0
How to do this ?
Thanks a lot
Your for loop is what's causing your issues. You end up creating a big list that is somewhat difficult to perform calculations on (check out names(w) to see what I mean). A better data structure would allow for easier calculations:
# Accumulator: the frequency table of every draw gets stacked onto this.
x <- NULL
for (i in seq_len(1000)) {
  # Draw 30 colors without replacement (sample()'s default).
  mysamp <- sample(mydata$Color, size = 30)
  # Tabulate the draw; table() keeps every factor level, so colors that
  # were not drawn still appear with a count of 0.
  mysamp <- data.frame(table(mysamp))
  # Append this draw's table to the running result.
  x <- rbind(x, mysamp)
}
# Sum the per-draw counts for each color across all draws.
aggregate(Freq ~ mysamp, data = x, FUN = sum)
Note that this loop runs a bit slower than your loop. This is because of the rbind() function. See this post. Maybe someone will come along with a more efficient solution.