Introduction
Hi to everyone,
for a little project, I try to get a function to compare values of a Data Frame 1 with values from a Data Frame 2. Thereafter, data frames 3 and 4 are supposed to get printed with the information of the comparison.
Data Frame 1:
ID
x1i
x2i
x3i
a
1
2
4
b
1
4
1
Data Frame 2:
Data_Frame_2 <- c(1:4)
Read x1a and compare with Data Frame 2. The value 1 is in Data Frame 2. Print value 1 and the name of the variable (x1a) in Data Frame 3 and cross out the value 1 from Data Frame 2.
Read x1b and compare with Data Frame 2. The value 1 is (not anymore) in Data Frame 2. Read x2b. The value 4 is in Data Frame 2. Print value 4 and the name of the variable (x2b) in Data Frame 3 and cross out the value 4 from Data Frame 2.
The Data Frame 3 is supposed to be something like this:
Data Frame 3:
ID
Value
Variable
a
1
x1i
b
4
x2i
Data Frame 4 (the remaining numbers of Data Frame 2):
Remaining numbers
2
3
Example in R to solve this theoretical problem
Until now, I worked out this code which does the job:
b <- as.data.frame(c(1:4)) # data frame 2
colnames(b, do.NULL = FALSE)
colnames(b) <- c("b")
View(b)
a <- as.data.frame(cbind(c("a","b"), c(3,3), c(2,1), c(1,2))) # data frame 1
colnames(a, do.NULL = FALSE)
colnames(a) <- c("ID","x1i","x2i","x3i")
View(a)
`%notin%` <- Negate(`%in%`) #got this one from <https://www.marsja.se/how-to-use-in-in-r/>
Read_Info <- function(a,b)
{
if (a[1,2] %in% b[1:4,1]) {c_1<-c(a[1,1:2],names(a)[2]); b1<-subset(b,b %notin% a[1,2])}
if (a[2,2] %in% b1[1:3,1]) {c_2<-c(a[2,1:2],names(a)[2]); b2<-subset(b,b %notin% c(a[1,2],a[2,2]))}
else if (a[2,3] %in% b1[1:3,1]) {c_2<-c(a[2,1],a[2,3],names(a)[3]); b2<-subset(b,b %notin% c(a[1,2],a[2,3]))}
if (a[3,2] %in% b2[1:2,1]) {c_3<-c(a[3,1],a[3,2],names(a)[2]); b3<-subset(b,b %notin% c(a[1,2],a[2,3],a[3,2]))}
else if (a[3,2] %notin% b2[1:2,1]) {c_3<-c(NA,NA,NA); b3<-b2}
c<-rbind(c_1,c_2,c_3)
colnames(c, do.NULL = FALSE)
colnames(c) <- c("ID","Value","Variable")
bx<-b3
colnames(bx, do.NULL = FALSE)
colnames(bx) <- c("Remaining numbers")
print(c)
print(bx)
}
Read_Info(a,b)
# In this example, c is data frame 3 and bx is data frame 4
Actual Task at hand - If, else if Loop Function in R
I do face the following obstacle: the actual data which I have is a little bit larger than the above example. Nevertheless, it follows the same structure:
b <- as.data.frame(c(1:20)) # this would be Data Frame 2 in the theoretical considerations
colnames(l, do.NULL = FALSE)
colnames(l) <- c("b")
View(l)
# This would be data frame 1 in the theoretical considerations
# Note: between "ID" and "x1i", there are now two additional variables which were not in the example above
# Although these two variables are part of the data, they are not of interest right know
a2 <- cbind(c("a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t"),c(0),c(1))
a1 <- data.frame(replicate(16,sample(1:20,rep=T)))
a <- cbind(a2, a1)
colnames(a, do.NULL = FALSE)
colnames(a) <- c("ID","variable1","variable2","x1i","x2i","x3i","x4i","x5i","x6i","x7i","x8i","x9i","x10i","x11i","x12i","x13i","x14i")
View(a)
I try to create an “if”, “else if” loop function utilizing "for" which is supposed to do this reading task by itself. Until now, I wrote down the following code which does not work yet.
`%notin%` <- Negate(`%in%`) # got this one from <https://www.marsja.se/how-to-use-in-in-r/>
Read_Info_Loop <- function(a,b)
{for (i in 1:20)
{ if (a[i,4] %in% b[1:(21-i),1]) {x[i]<-c(a[i,1],a[i,4],names(a)[4]); b[i]<-subset(b,b %notin% a[i,4])}
if (a[i,5] %in% b[i-1][1:(21-i),1]) {x[i]<-c(a[i,1],a[i,5],names(a)[5]); b[i]<-subset(b,b %notin% c(a[1,4],a[i,5]))
} else if (a[i,6] %in% b[i-1][1:(21-i),1]) {x[i]<-c(a[i,1],a[i,6],names(a)[6]); b[i]<-subset(b,b %notin% c(a[1,4],a[i,6]))
} else if (a[i,7] %in% b[i-1][1:(21-i),1]) {x[i]<-c(a[i,1],a[i,7],names(a)[7]); b[i]<-subset(b,b %notin% c(a[1,4],a[i,7]))
} else if (a[i,8] %in% b[i-1][1:(21-i),1]) {x[i]<-c(a[i,1],a[i,8],names(a)[8]); b[i]<-subset(b,b %notin% c(a[1,4],a[i,8]))
} else if (a[i,9] %in% b[i-1][1:(21-i),1]) {x[i]<-c(a[i,1],a[i,9],names(a)[9]); b[i]<-subset(b,b %notin% c(a[1,4],a[i,9]))
} else if (a[i,10] %in% b[i-1][1:(21-i),1]) {x[i]<-c(a[i,1],a[i,10],names(a)[10]); b[i]<-subset(b,b %notin% c(a[1,4],a[i,10]))
} else if (a[i,11] %in% b[i-1][1:(21-i),1]) {x[i]<-c(a[i,1],a[i,11],names(a)[11]); b[i]<-subset(b,b %notin% c(a[1,4],a[i,11]))
} else if (a[i,12] %in% b[i-1][1:(21-i),1]) {x[i]<-c(a[i,1],a[i,12],names(a)[12]); b[i]<-subset(b,b %notin% c(a[1,4],a[i,12]))
} else if (a[i,13] %in% b[i-1][1:(21-i),1]) {x[i]<-c(a[i,1],a[i,13],names(a)[13]); b[i]<-subset(b,b %notin% c(a[1,4],a[i,13]))
} else if (a[i,14] %in% b[i-1][1:(21-i),1]) {x[i]<-c(a[i,1],a[i,14],names(a)[14]); b[i]<-subset(b,b %notin% c(a[1,4],a[i,14]))
} else if (a[i,15] %in% b[i-1][1:(21-i),1]) {x[i]<-c(a[i,1],a[i,15],names(a)[15]); b[i]<-subset(b,b %notin% c(a[1,4],a[i,15]))
} else if (a[i,16] %in% b[i-1][1:(21-i),1]) {x[i]<-c(a[i,1],a[i,16],names(a)[16]); b[i]<-subset(b,b %notin% c(a[1,4],a[i,16]))
} else if (a[i,17] %in% b[i-1][1:(21-i),1]) {x[i]<-c(a[i,1],a[i,17],names(a)[17]); b[i]<-subset(b,b %notin% c(a[1,4],a[i,17]))
} else if (a[i,17] %notin% b[1:(21-i),1]) {x[i]<-c(NA,NA,NA); b[i]<-c(b[i-1])}
y<-rbind(x[i[1:20]])
colnames(y, do.NULL = FALSE)
colnames(y) <- c("ID","Value","Variable")
u<-rbind(b[i=20])
colnames(u, do.NULL = FALSE)
colnames(u) <- c("Remaining numbers")
print(y)
print(u)
}
}
# y is supposed to be data frame 3 and u is supposed to be data frame 4
# in the above theoretical considerations
Errors
I now get the following errors:
Error in `[<-.data.frame`(`*tmp*`, i, value = c("a", "1", "x3i")) :
replacement has 3 rows, data has 4
Error in Read_Info_Loop(test, l) : object 'x' not found
...nevertheless, the first error, I got yesterday. Today, after restarting R, the second error occurred which seems to address internal structural problems of the function code. Additionally, I am pretty sure, that there might be further errors which are right now "hidden" behind the other errors and which will occur as soon as the two above mentioned errors are dealt with.
However, I do not want you to just solve any problems. I rather would like to ask, if you have ideas how I can solve these two specific errors, and maybe a hint to just get the function a little bit closer to work properly. So, for me the focus is clearly on learning a thing or two in general.
A few disclaimers: I have little experience in programming, so the code or my descriptions are probably rather messy. Therefore, if you have any questions for clarification, please feel free to ask. I try to respond as quickly as possible. English is not my first language, so please excuse me for any language mistakes.
I am looking forward to learning and hear your ideas about the code itself, ideas regarding the theoretical considerations or the approach to the loop function.
Kind Regards
Paul
Edits / Progression
Edit: I just realized, that the code can already be simplified with another "for". Nevertheless, I read that one should rather avoid nested "for" loops (for...for...)
`%notin%` <- Negate(`%in%`) #got this one from <https://www.marsja.se/how-to-use-in-in-r/>
Read_Info_Loop2 <- function(a,b)
{for (i in 1:20) for (k in 5:17) {
{ if (a[i,4] %in% b[1:(21-i),1]) {x[i]<-c(a[i,1],a[i,4],names(a)[4]); b[i]<-subset(b,b %notin% a[i,4])
} else if (a[i,k] %in% b[i-1][1:(21-i),1]) {x[i]<-c(a[i,1],a[i,k],names(a)[k]); b[i]<-subset(b,b %notin% c(a[1,4],a[i,k]))
} else if (a[i,k] %notin% b[1:(21-i),1]) {x[i]<-c(NA,NA,NA); b[i]<-c(b[i-1])}
}
y<-rbind(x[i[1:20]])
colnames(y, do.NULL = FALSE)
colnames(y) <- c("ID","Value","Variable")
u<-rbind(b[i=20])
colnames(u, do.NULL = FALSE)
colnames(u) <- c("Remaining numbers")
print(y)
print(u)
}
}
The same error was shown:
Error in Read_Info_Loop2(test, l) : object 'x' not found
I try to use this resource, going forward: https://cran.r-project.org/doc/manuals/r-release/R-intro.html#Repetitive-execution
I am going to give further updates.
This is a tricky one. I was able to find a solution for the underlying problem but unfortunately I wasn't able to fix OP's code as it was requested.
However, here is my solution:
library(data.table)
long <- melt(setDT(a), "ID", patterns("^x"))
df3 <- long[, {
if (any(.SD$value %in% b)) {
result <- first(.SD[value %in% b])
b <- setdiff(b, result$value)
} else {
result <- data.table(variable = NA_integer_, value = NA_integer_)
}
result
}, by = ID]
df3
ID variable value
1: a x1i 1
2: b x2i 4
# remaining values
df4 <- data.table(Remaining.numbers = setdiff(b, df3$value))
df4
Remaining.numbers
1: 2
2: 3
Explanation
In a first step, the dataset a is reshaped into long format
long
ID variable value
1: a x1i 1
2: b x1i 1
3: a x2i 2
4: b x2i 4
5: a x3i 4
6: b x3i 1
Now, variable contains the column names as data items which simplifies subsequent steps. Note that melt() has maintained the original order of rows and columns which is important for picking the correct values later on.
Now, we kind of loop through long by unique values of ID. This is achieved by grouping. As a speciality of data.table, we can use an arbitrary expression (enclosed in curly brackets) for aggregation.
For each ID, we check if there is at least one value still available in the vector of remaining values. If so, the first appearance is taken as resulting row. The corresponding value is removed from b which is then used in the next "iteration", i.e., the next group level.
Please note that b inside the expression (in curly brackets) is a local variable. The modified value of b is not available outside of the environment of the expression.
While testing with arbitrary datasets I have noticed that there might be situations where all numbers which belong to an ID already have been removed from remaining. To indicate this, a dummy result with NA values is returned.
So, for each ID group one row is returned which are then combined into one data.table object and assigned to df3.
df4 contains the Remaining.numbers and is created from building the set difference between b and the vector of picked values df3$value.
Note that I have tried to rewrite the code as a loop for demonstration purposes but I have given up because I found that the bookkeeping overhead wasn't worth it.
Data
For the first use case in OP's question:
a <- fread("ID x1i x2i x3i
a 1 2 4
b 1 4 1")
b <- 1:4
Other use cases with varying numbers of rows, columns, and lengths of b can be created using the code below. Please note that set.seed() is important because the created dataset a and the results df3 and df4 depend on it. For example, with set.seed(123) we can reproduce the situation where the list of remaining numbers for the last ID is exhausted.
# number of rows and columns to create
n_rows <- 18
n_cols <- 16
# create vector b
b <- 1:20
# create data.frame a
a2 <- data.frame(ID = letters[seq(n_rows)], variable1 = 0, variable2 = 1)
set.seed(123) # to ensure reproducible results
a1 <- as.data.frame(replicate(n_cols, sample(b, n_rows, replace = TRUE)))
colnames(a1) <- sprintf("x%ii", seq(n_cols))
a <- cbind(a2, a1)
Uwe’s Solution
Thank you very much, Uwe, for your solution and comprehensive explanation! It did not even occur to me, to combine the values into one list and to let the function run over that list. So, your solution opened a new perspective on the data. I am going to try out your solution in detail to learn as much as possible and report back here as soon as possible!
Solution regarding the original code
I was able to get to a solution for the original code which took quite some time.
test2 <- cbind(c("a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t"),c(0),c(1),c(1,1,1,sample(1:15),1,1),c(2,3,3,sample(1:15),2,3))
test1 <- data.frame(replicate(12,sample(1:20,rep=T)))
data.frame1 <- cbind(test2, test1)
colnames(data.frame1) <- c("ID","variable1","variable2","x1i","x2i","x3i","x4i","x5i","x6i","x7i","x8i","x9i","x10i","x11i","x12i","x13i","x14i")
data.frame2 <- as.data.frame(c(1:20))
x <- as.data.frame(matrix(NA,nrow = 3,ncol = 20))
rownames(x) <- c("ID","value","variable")
colnames(x) <- c()
View(x)
`%notin%` <- Negate(`%in%`) #got this one from <https://www.marsja.se/how-to-use-in-in-r/>
Read_Info_Loop2 <- function(a,b) {for (k in 1:20) {for (i in 4:17)
{if (a[k,i] %in% b[,1]) {x[k]<-c(a[k,1],(a[k,i]),names(a[i])); b<-subset(b,b %notin% a[k,i]);break}}
}
c<-rbind(x)
bx<-b
colnames(bx) <- c("numbers remaining")
print(c)
print(bx)
}
Read_Info_Loop2(data.frame1, data.frame2)
The only downfall with this solution is the output. It is rather in a weird form. But I don’t mind really. So now we already have two solutions which use different approaches. Very exciting. Regarding the output (see picture below of the output of some of the actual data) of data.frames 3 and 4: The last 7 columns are NAs because this data.frame1_original has just 13 rows (k=13). So for the last 7 iterations (k=14 to k=20), there is no output.
Here is the output of the random data.frame1 as described above. Here, the solution looks rather weird, since for "r" and "t" all entries are already deleted from data.frame2 which returns NAs for these rows. The two numbers, which remain are 18 and 20.
I'm working with multiple big data frames in R and I'm trying to write functions that can modify each of them (given a set of common parameters). One function is giving me trouble (shown below).
RawData <- function(x)
{
for(i in 1:nrow(x))
{
if(grep(".DERIVED", x[i,]) >= 1)
{
x <- x[-i,]
}
}
for(i in 1:ncol(x))
{
if(is.numeric(x[,i]) != TRUE)
{
x <- x[,-i]
}
}
return(x)
}
The objective of this function is twofold: first, to remove any rows that contain a ".DERIVED" string in any one of their cells (using grep), and second, to remove any columns that are non-numeric (using is.numeric). I get an error on the following condition:
if(grep(".DERIVED", x[i,]) >= 1)
The error states the "argument is of zero length", which I believe is usually associated with NULL values in a vector. However, I've used is.null on the entire data frame that is giving me errors, and it confirmed that there are no null values in the DF. I'm sure I'm missing something relatively simple here. Any advice would be greatly appreciated.
If you can use non-base-R functions, this should address your issue. df is the data.frame in question here. It will also be faster than looping over rows (generally not advised if avoidable).
library(dplyr)
library(stringr)
df %>%
filter_all(!str_detect(., '\\.DERIVED')) %>%
select_if(is.numeric)
You can make it a function just as you would anything else:
mattsFunction <- function(dat){
dat %>%
filter_all(!str_detect(., '\\.DERIVED')) %>%
select_if(is.numeric)
}
you should probably give it a better name though
The error is from the line
if(grep(".DERIVED", x[i,]) >= 1)
When grep doesn't find the term ".DERIVED", it returns something of zero length, your inequality doesn't return TRUE or FALSE, but rather returns logical(0). The error is telling you that the if statement cannot evaluate whether logical(0) >= 1
A simple example:
if(grep(".DERIVED", "1234.DERIVEDabcdefg") >= 1) {print("it works")} # Works nicely, since the inequality can be evaluated
if(grep(".DERIVED", "1234abcdefg") > 1) {print("no dice")}
You can replace that line with if(length(grep(".DERIVED", x[i,])) != 0)
There's something else you haven't noticed yet, which is that you're removing rows/columns in a loop. Say you remove the 5th column, the next loop iteration (when i = 6) will be handling what was the 7th row! (this will end in an error along the lines of Error in[.data.frame(x, , i) : undefined columns selected)
I prefer using dplyr, but if you need to use base R functions there are ways to to this without if statements.
Notice that you should consider using the regex version of "\\.DERIVED" and not ".DERIVED" which would mean "any character followed by DERIVED".
I don't have example data or output, so here's my best go...
# Made up data
test <- data.frame(a = c("data","data.DERIVED","data","data","data.DERIVED"),
b = (c(1,2,3,4,5)),
c = c("A","B","C","D","E"),
d = c(2,5,6,8,9),
stringsAsFactors = FALSE)
# Note: The following code assumes that the column class is numeric because the
# example code provided assumed that the column class was numeric. This will not
# detects if the column is full of a string of character values of only numbers.
# Using the base subset command
test2 <- subset(test,
subset = !grepl("\\.DERIVED",test$a),
select = sapply(test,is.numeric))
# > test2
# b d
# 1 1 2
# 3 3 6
# 4 4 8
# Trying to use []. Note: If only 1 column is numeric this will return a vector
# instead of a data.frame
test2 <- test[!grepl("\\.DERIVED",test$a),]
test2 <- test2[,sapply(test,is.numeric)]
# > test2
# b d
# 1 1 2
# 3 3 6
# 4 4 8
Trying to using %in% operator in r to find an equivalent SAS Code as below:
If weather in (2,5) then new_weather=25;
else if weather in (1,3,4,7) then new_weather=14;
else new_weather=weather;
SAS code will produce variable "new_weather" with values 25, 14 and as defined in variable "weather".
R code:
GS <- function(df, col, newcol){
# Pass a dataframe, col name, new column name
df[newcol] = df[col]
df[df[newcol] %in% c(2,5)]= 25
df[df[newcol] %in% c(1,3,4,7)] = 14
return(df)
}
Result: output values of "col" and "newcol" are same, when passing a data frame through a function "GS". Syntax is not picking up the second or more values for a variable "newcol"? Appreciated your time explaining the reason and possible fix.
Is this what you are trying to do?
df <- data.frame(A=seq(1:4), B=seq(1:4))
add_and_adjust <- function(df, copy_column, new_column_name) {
df[new_column_name] <- df[copy_column] # make copy of column
df[,new_column_name] <- ifelse(df[,new_column_name] %in% c(2,5), 25, df[,new_column_name])
df[,new_column_name] <- ifelse(df[,new_column_name] %in% c(1,3,4,7), 14, df[,new_column_name])
return(df)
}
Usage:
add_and_adjust(df, 'B', 'my_new_column')
df[newcol] is a data frame (with one column), df[[newcol]] or df[, newcol] is a vector (just the column). You need to use [[ here.
You also need to be assigning the result to df[[newcol]], not to the whole df. And to be perfectly consistent and safe you should probably test the col values, not the newcol values.
GS <- function(df, col, newcol){
# Pass a dataframe, col name, new column name
df[[newcol]] = df[[col]]
df[[newcol]][df[[col]] %in% c(2,5)] = 25
df[[newcol]][df[[col]] %in% c(1,3,4,7)] = 14
return(df)
}
GS(data.frame(x = 1:7), "x", "new")
# x new
# 1 1 14
# 2 2 25
# 3 3 14
# 4 4 14
# 5 5 25
# 6 6 6
# 7 7 14
#user9231640 before you invest too much time in writing your own function you may want to explore some of the recode functions that already exist in places like car and Hmisc.
Depending on how complex your recoding gets your function will get longer and longer to check various boundary conditions or to change data types.
Just based upon your example you can do this in base R and it will be more self documenting and transparent at one level:
df <- data.frame(A=seq(1:30), B=seq(1:30))
df$my_new_column <- df$B
df$my_new_column <- ifelse(df$my_new_column %in% c(2,5), 25, df$my_new_column)
df$my_new_column <- ifelse(df$my_new_column %in% c(1,3,4,7), 14, df$my_new_column)
A New Year's quandary for the stackoverflow community which has been quite the help by reading posts and answers in the past (this is my first question). I've found a work around, but I'm wondering if other approaches/solutions might be suggested.
I am attempting to remove trailing NA's from a large data.frame, but those NA's are only found in a few of the columns of the data.frame and I would like to retain all columns in the output. Here is a representative data subset.
df=data.frame(var1=rep("A", 8), var2=c("a","b","c","d","e","f","g","h"), var3=c(0,1,NA,2,3,NA,NA,NA), var4=c(0,0,NA,4,5,NA,NA,NA), var5=c(0,0,NA,0,2,4,NA,NA))
Goals of the process:
Trim trailing NAs based on NA presence in var3,var4 and var5
Retain all columns in final output
Only remove trailing NAs (i.e. row 3 remains in record as a placeholder)
Only trim if all columns have an NA (i.e. row 7 and 8, but not row 6)
Based on these goals, the solution should remove the last two rows of df:
df.output = df[-c(7,8),]
The behaviour of na.trim (in the zoo package) is ideal (as it limits removal to those NA's at the end of the data.frame, with sides="right"), and my work-around involved altering the na.trim.default function to include a subset term.
Any suggestions? Many thanks for any help.
EDIT: Just to complete this question, below is the function I created from the na.trim.default code which also works, but as noted, does require loading the zoo package.
na.trim.multiplecols <- function (object, colrange, sides = c("both", "left", "right"), is.na = c("any","all"),...)
{
is.na <- match.arg(is.na)
nisna <- if (is.na == "any" || length(dim(object[,colrange])) < 1) {
complete.cases(object[,colrange])
}
else rowSums(!is.na(object[,colrange])) > 0
idx <- switch(match.arg(sides), left = cumsum(nisna) > 0,
right = rev(cumsum(rev(nisna) > 0) > 0), both = (cumsum(nisna) >
0) & rev(cumsum(rev(nisna)) > 0))
if (length(dim(object)) < 2)
object[idx]
else object[idx, , drop = FALSE]
}
Something based on max(which(!is.na())) will work. We use this to find the largest index of non-missing data from the columns of interest.
Using your df
ind <- max(max(which(!is.na(df$var3))),
max(which(!is.na(df$var4))),
max(which(!is.na(df$var5))))
df[1:ind, ]
var1 var2 var3 var4 var5
1 A a 0 0 0
2 A b 1 0 0
3 A c NA NA NA
4 A d 2 4 0
5 A e 3 5 2
6 A f NA NA 4
Edit: First solution using base rle and apply
t <- rle(apply(as.matrix(df[,3:5]), 1, function(x) all(is.na(x))))
r <- ifelse(t$values[length(t$values)] == TRUE, t$lengths[length(t$lengths)], 0)
head(df, -r)
Second solution using Rle from package IRanges:
require(IRanges)
t <- min(sapply(df[,3:5], function(x) {
o <- Rle(x)
val <- runValue(o)
if (is.na(val[length(val)])) {
len <- runLength(o)
out <- len[length(len)]
} else {
out <- 0
}
}))
head(df, -t)
I have a question regarding data frames in R. I want to take a data.frame, dfy, and find the first occurrence of dfy$workerId in dfx$workers, to create a new dataframe, dfz, a copy of dfx that also contains the first occurance of dfy$workerId in dfx$wokers as dfz$highestRankingGroup. Its a little tricky becuase dfx$workers is a single spaced seperated string. My original plan was to do this in Perl, but I would like to find a way to work in R and avoid having to write out to temp. files.
thank you for your time.
y <- "name,workerId,aptitude
joe,4,34
steve,5,42
jon,7,23
nick,8,122"
x <- "workers,projectScore
1 2 3 8 ,92
1 2 5 9 ,89
3 5 7 ,85
1 8 9 10 ,82
4 5 7 8 ,83
1 3 5 7 8 ,79"
z <- "name,workerId,aptitude,highestRankingGroup
joe,4,0.34,5
steve,5,0.42,2
jon,7,0.23,3
nick,8,0.122,1"
dfy <- read.csv(textConnection(y), header=TRUE, sep=",", stringsAsFactors=FALSE)
dfx <- read.csv(textConnection(x), header=TRUE, sep=",", stringsAsFactors=FALSE)
dfz <- read.csv(textConnection(z), header=TRUE, sep=",", stringsAsFactors=FALSE)
First, add the highestRankingGroup column to your dataset dfx
dfx$highestRankingGroup <- seq(1, length(dfx$projectScore))
Since you have mentioned perl you can do a familar perl thing and simple split the workers column in whitespaces. I combined the splitting with functions from the plyr package which are always nice to work with.
library(plyr)
df.l <- dlply(dfx, "projectScore")
f.reshape <- function(x) {
wrk <- strsplit(x$workers, "\\s", perl = TRUE)
data.frame(worker = wrk[[1]]
, projectScore = x$projectScore
, highestRankingGroup = x$highestRankingGroup
)
}
df.tmp <- ldply(df.l, f.reshape)
df.z1 <- merge(df.tmp, dfy, by.x = "worker", by.y = "workerId")
Now you have to look for the max values in the projectScore column:
df.z2 <- ddply(df.z1, "name", function(x) x[x$projectScore == max(x$projectScore), ])
This produces:
R> df.z2
worker .id projectScore highestRankingGroup name aptitude
1 4 83 83 5 joe 34
2 7 85 85 3 jon 23
3 8 92 92 1 nick 122
4 5 89 89 2 steve 42
R>
You can reshape the df.z2 dataframe according to your personal taste. Simply look at the different steps and the produced objects in order to see at which step different columns, etc get introduced.
Before I start, I recommend that you go with #mropa's answer. This answer is a bit of fun I had messing about with your question. On the plus side, it does involve a bit of fun with function closures ;)
Essentially, I create a function that returns two functions.
updateDFz = function(dfy) {
## Create a default dfz matrix
dfz = dfy
dfz$HRG = 10000 ## Big max value
counter = 0
## Update the dfz matrix after every row
update = function(x) {
counter <<- counter + 1
for(i in seq_along(x)) {
if(is.element(x[i], dfz$workerId))
dfz[dfz$workerId == x[i],]$HRG <<- min(dfz[dfz$workerId == x[i],]$HRG, counter)
}
return(dfz)
}
## Get the dfz matrix
getDFz = function()
return(dfz)
list(getDFz=getDFz, update=update)
}
f = updateDFz(dfy)
lapply(strsplit(dfx$workers, " "), f$update)
f$getDFz()
As I said, a bit of fun ;)
Hopefully someone finds this useful.
# Recieves a data.frame and a search column
# Returns a data.frame of the first occurances of all unique values of the "search" column
getfirsts <- function(data, searchcol){
rows <- as.data.frame(match(unique(data[[searchcol]]), data[[searchcol]]))
firsts = data[rows[[1]],]
return(firsts)
}