Create long character strings in name levels - r

I'd like to do a modification in levels names by a rule, but I have a problem below:
my data; intially df variable was class matrix I changed it
df <- data.frame(x = c("P27C", "P31B", "P12E", "P3E", "P7A", "P7D", "P2A", "P7D",
"P34", "P10C"),
y = rnorm(10), stringsAsFactors = F)
s<-c("P27CvsP31B","P27CvsP3C","P27CvsP3E","P27CvsP6B","P27CvsP7A","P27CvsP7C",
"P27DvsP27E","P27DvsP2B","P27DvsP31A","P27DvsP31B","P27DvsP3D","P27DvsP7D",
"P27EvsP2A","P27EvsP2B","P27EvsP2E","P27EvsP2F","P27EvsP2G","P27EvsP34",
"P7AvsP7H","P7BvsP7D","P7CvsP7G","P7DvsP7E","P7DvsP7F","P7DvsP7G","P7DvsP7H")
df
df$z <- lapply(df$x, grep, s, value = T)
# gives you the matches but empty slots for a missing value like "P12E"
df
for (r in 1:nrow(df)) {
if (length(df$z[[r]]) == 0) {
df$z[[r]] <- df$x[[r]]
}
else {
df$z[[r]] <- df$z[[r]]
}
}
# restores the original name of unmatched values
df$z
#Rename but in list format!!!
and my desired output is:
x y z
1 P27C 2.22354499 "P27CvsP31B, P27CvsP3C, P27CvsP3E, P27CvsP6B, P27CvsP7A, P27CvsP7C"
2 P31B 0.89197064 "P27CvsP31B, P27DvsP31B"
3 P12E -0.02313754 "P12E"
4 P3E 0.69916446 "P27CvsP3E"
5 P7A -0.44895512 "P27CvsP7A, P7AvsP7H"
6 P7D 1.77619979 "P27DvsP7D, P7BvsP7D, P7DvsP7E, P7DvsP7F, P7DvsP7G, P7DvsP7H"
7 P2A -0.18261732 "P27EvsP2A"
8 P7D 0.12025524 "P27DvsP7D, P7BvsP7D, P7DvsP7E, P7DvsP7F, P7DvsP7G, P7DvsP7H"
9 P34 -0.13434265 "P27EvsP34"
10 P10C 0.19971201 "P10C"
Thanks

Looks a bit ugly with the nested sapply. It loops over x column of your df and matches all the entries with your vector s creating a list of the matched results. The second sapply loops over that list and pastes all the entries. If there is no match, then it returns an empty cell which we handle by substituting the df$x entry at its place.
df$z <- sapply(sapply(df$x, function(i) s[grepl(i, s)]), paste, collapse = ',')
df$z[df$z == ''] <- df$x[df$z == '']
df
# x y z
#1 P27C -0.95290496 P27CvsP31B,P27CvsP3C,P27CvsP3E,P27CvsP6B,P27CvsP7A,P27CvsP7C
#2 P31B 1.62237939 P27CvsP31B,P27DvsP31B
#3 P12E 2.60014202 P12E
#4 P3E 0.13964851 P27CvsP3E
#5 P7A -1.35071967 P27CvsP7A,P7AvsP7H
#6 P7D 0.79893102 P27DvsP7D,P7BvsP7D,P7DvsP7E,P7DvsP7F,P7DvsP7G,P7DvsP7H
#7 P2A -1.55499584 P27EvsP2A
#8 P7D 0.46372006 P27DvsP7D,P7BvsP7D,P7DvsP7E,P7DvsP7F,P7DvsP7G,P7DvsP7H
#9 P34 0.05242956 P27EvsP34
#10 P10C -0.20203180 P10C
EDIT
Based on #akrun's suggestion, an option with data.table would be,
library(data.table)
setDT(df)[, z := unlist(lapply(x, function(y) toString(grep(y, s, value = TRUE))))][z=="", z := x][]

Related

Combine two strings with commas in R

I haven't been able to find an answer to this, but I am guessing this is because I am not phrasing my question properly.
I want to combine two strings containing several comma-separated values into one string, alternating the inputs from each original string.
x <- '1,2'
y <- 'R,L'
# fictitious function
z <- combineSomehow(x,y)
z = '1R, 2L'
EDIT : Adding dataframe to better describe my issue. I would like to be able to accomplish the above, but within a mutate ideally.
df <- data.frame(
x = c('1','2','1,1','2','1'),
y = c('R','L','R,L','L','R'),
desired_result = c('1R','2L','1R,1L','2L','1R')
)
df:
x y desired_result
1 1 R 1R
2 2 L 2L
3 1,1 R,L 1R,1L
4 2 L 2L
5 1 R 1R
Final Edit/Answer: Based on #akrun's comment/response below and after removing the error originally in df, this ended up being the tidyverse answer:
mutate(desired_result = map2(.x=strsplit(x,','),.y=strsplit(y,','),
~ str_c(.x,.y, collapse=',')))
It can be done with strsplit and paste
combineSomehow <- function(x, y) {
do.call(paste0, c(strsplit(c(x,y),","), collapse=", "))
}
combineSomehow(x,y)
#[1] "1R, 2L"
Without modifying the function, we can Vectorize it to apply on multiple elements
df$desired_result2 <- Vectorize(combineSomehow)(df$x, df$y)

order a list of stings in r

The data I have include two variables: id and income (a list of characters)
id <- seq(1,6)
income <- c("2322;5125",
"0110;2012",
"2212;0912",
"1012;0145",
"1545;1102",
"1010;2028")
df <- data.frame(id, income)
df$income <- as.character(df$income)
I need to add a third column income_order which includes the ordered values of column income. The final output would look like
NOTE: I would still need to keep the leading zeros
We could split the string on ";", sort and paste the string back.
df$income_order <- sapply(strsplit(df$income, ";"), function(x)
paste(sort(x), collapse = ";"))
df
# id income income_order
#1 1 2322;5125 2322;5125
#2 2 0110;2012 0110;2012
#3 3 2212;0912 0912;2212
#4 4 1012;0145 0145;1012
#5 5 1545;1102 1102;1545
#6 6 1010;2028 1010;2028
We can use gsubfn
library(gsubfn)
df$income_order <- gsubfn("(\\d+);(\\d+)", ~ paste(sort(c(x, y)), collapse=";"), df$income)
df$income_order
#[1] "2322;5125" "0110;2012" "0912;2212" "0145;1012" "1102;1545" "1010;2028"

Creating function to read data set and columns and displyaing nrow

I am struggling a bit with a probably fairly simple task. I wanted to create a function that has arguments of dataframe(df), column names of dataframe(T and R), value of the selected column of dataframe(a and b). I know that the function reads the dataframe. but , I don't know how the columns are selected. I'm getting an error.
fun <- function(df,T,a,R,b)
{
col <- ds[c("x","y")]
omit <- na.omit(col)
data1 <- omit[omit$x == 'a',]
data2 <- omit[omit$x == 'b',]
nrow(data2)/nrow(data1)
}
fun(jugs,Place,UK,Price,10)
I'm new to r language. So, please help me.
There are several errors you're making.
col <- ds[c("x","y")]
What are x and y? Presumably they're arguments that you're passing, but you specify T and R in your function, not x and y.
data1 <- omit[omit$x == 'a',]
data2 <- omit[omit$x == 'b',]
Again, presumably, you want a and b to be arguments you passed to the function, but you specified 'a' and 'b' which are specific, not general arguments. Also, I assume that second "omit$x" should be "omit$y" (or vice versa). And actually, since you just made this into a new data frame with two columns, you can just use the column index.
nrow(data2)/nrow(data1)
You should print this line, or return it. Either one should suffice.
fun(jugs,Place,UK,Price,10)
Finally, you should use quotes on Place, UK, and Price, at least the way I've done it.
fun <- function(df, col1, val1, col2, val2){
new_cols <- df[,c(col1, col2)]
omit <- na.omit(new_cols)
data1 <- omit[omit[,1] == val1,]
data2 <- omit[omit[,2] == val2,]
print(nrow(data2)/nrow(data1))
}
fun(jugs, "Place", "UK", "Price", 10)
And if I understand what you're trying to do, it may be easier to avoid creating multiple dataframes that you don't need and just use counts instead.
fun <- function(df, col1, val1, col2, val2){
new_cols <- df[,c(col1, col2)]
omit <- na.omit(new_cols)
n1 <- sum(omit[,1] == val1)
n2 <- sum(omit[,2] == val2)
print(n2/n1)
}
fun(jugs, "Place", "UK", "Price", 10)
I would write this function as follows:
fun <- function(df,T,a,R,b) {
data <- na.omit(df[c(T,R)]);
sum(data[[R]]==b)/sum(data[[T]]==a);
};
As you can see, you can combine the first two lines into one, because in your code col was not reused anywhere. Secondly, since you only care about the number of rows of the two subsets of the intermediate data.frame, you don't actually need to construct those two data.frames; instead, you can just compute the logical vectors that result from the two comparisons, and then call sum() on those logical vectors, which naturally treats FALSE as 0 and TRUE as 1.
Demo:
fun <- function(df,T,a,R,b) { data <- na.omit(df[c(T,R)]); sum(data[[R]]==b)/sum(data[[T]]==a); };
df <- data.frame(place=c(rep(c('p1','p2'),each=4),NA,NA), price=c(10,10,20,NA,20,20,20,NA,20,20), stringsAsFactors=F );
df;
## place price
## 1 p1 10
## 2 p1 10
## 3 p1 20
## 4 p1 NA
## 5 p2 20
## 6 p2 20
## 7 p2 20
## 8 p2 NA
## 9 <NA> 20
## 10 <NA> 20
fun(df,'place','p1','price',20);
## [1] 1.333333

exclusively mark y's variables when merging in loop

As a continuation of this question I am now looking for a way to mark only non-unique variables from the y-data-frame when I merge.
The default for suffixes is to look for a vector that has the length of two.
Say I have this list,
a <- list(A = data.frame(id = c(01, 02), a=runif(2), b=runif(2)),
B = data.frame(id = c(01, 02), b=runif(2), c=runif(2)),
C = data.frame(id = c(01, 02), c=runif(2), d=runif(2)))
a
$A
id a b
1 1 0.6922513 0.9966336
2 2 0.9216164 0.8256744
$B
id b c
1 1 0.2242940 0.7058331
2 2 0.4474754 0.9228213
$C
id c d
1 1 0.969796 0.1761250
2 2 0.633697 0.6618188
then I make some customization where I merge some of the data frames together one by oen, here exemplified by taking out one data frame,
df <- a[[1]]
a <- a[setdiff(names(a), names(a[1]))]
then I merge the list in this way,
for(i in seq_along(a)) {
v <- a[[i]] # extract value
ns <- names(a)
n <- ns[[i]] # extract name
df <-merge(df, v, by.x="id", by.y="id", all.x=T,
suffixes=paste(".", n, sep = ""))
}
df
id a b.B bNA c.C cNA d
1 1 0.6922513 0.9966336 0.2242940 0.7058331 0.969796 0.1761250
2 2 0.9216164 0.8256744 0.4474754 0.9228213 0.633697 0.6618188
The issue is, as shown above, that R adds a mark to both non-unique variables, but as I only supplied one name n I get an NA on the 'other' variable.' In the example above I get an .B-suffix on the variable from the A-data-frame.
Is there a way I can either add the correct data frame name to both variables or (preferred) exclusively mark y's variables when merging?
This was a fun little puzzle. I cheated and "borrowed" heavily from Hadley's merge_recurse function in the reshape package:
merge_recurse1 <- function (dfs, ...)
{
n <- length(dfs)
if (!is.null(names(dfs))){
}
if (length(dfs) == 2) {
merge(dfs[[1]], dfs[[2]],all = TRUE,sort = FALSE,
suffixes = c('',names(dfs)[2]), ...)
}
else {
merge(Recall(dfs[-n],...), dfs[[n]],all = TRUE,sort = FALSE,
suffixes = c('',names(dfs)[n]),...)
}
}
> merge_recurse1(a,by = "id")
id a b bB c cC d
1 1 0.2536158 0.6083147 0.3060572 0.1428531 0.6403072 0.4621454
2 2 0.9839910 0.7256161 0.2203161 0.6653415 0.1496376 0.8767888
In addition to the suffix changes I made, I found I need to add a ... argument to Recall just to get merge_recurse to work the way I thought it should. Not sure if that's a bug or if I'm just misunderstanding the function.
Sorry... It took me a little while to understand your problem. But, you're... like... 99% there.
Change the argument:
suffixes = paste(".", n, sep = "")
to:
suffixes = c("", paste(".", n, sep = ""))
And you should be OK. By doing this, I got a df that looks like this:
> df
id a b b.B c c.C d
1 1 -0.6039805 0.08297807 0.06426459 2.787147 -0.9566280 -0.36054991
2 2 -0.1694382 -0.95296450 0.37144139 -1.346691 0.7072892 0.09239593
By the way, instead of all of this, did you try some of the other recommendations from earlier Stackoverflow posts? Somewhere I remember seeing something using Reduce which got me to this partial solution (with your original "a" data):
Reduce(function(x, y) merge(x, y, by="id", all=TRUE, suffixes=c("", "_2")),
a, accumulate=FALSE)
which gives you output like:
id a b b_2 c c_2 d
1 1 -0.6039805 0.08297807 0.06426459 2.787147 -0.9566280 -0.36054991
2 2 -0.1694382 -0.95296450 0.37144139 -1.346691 0.7072892 0.09239593
Are either of these more useful or closer to what you are looking for?

using R to compare data frames to find first occurrence to df1$columnA in df2$columnB when df2$columnB is a single space separated list

I have a question regarding data frames in R. I want to take a data.frame, dfy, and find the first occurrence of dfy$workerId in dfx$workers, to create a new dataframe, dfz, a copy of dfx that also contains the first occurance of dfy$workerId in dfx$wokers as dfz$highestRankingGroup. Its a little tricky becuase dfx$workers is a single spaced seperated string. My original plan was to do this in Perl, but I would like to find a way to work in R and avoid having to write out to temp. files.
thank you for your time.
y <- "name,workerId,aptitude
joe,4,34
steve,5,42
jon,7,23
nick,8,122"
x <- "workers,projectScore
1 2 3 8 ,92
1 2 5 9 ,89
3 5 7 ,85
1 8 9 10 ,82
4 5 7 8 ,83
1 3 5 7 8 ,79"
z <- "name,workerId,aptitude,highestRankingGroup
joe,4,0.34,5
steve,5,0.42,2
jon,7,0.23,3
nick,8,0.122,1"
dfy <- read.csv(textConnection(y), header=TRUE, sep=",", stringsAsFactors=FALSE)
dfx <- read.csv(textConnection(x), header=TRUE, sep=",", stringsAsFactors=FALSE)
dfz <- read.csv(textConnection(z), header=TRUE, sep=",", stringsAsFactors=FALSE)
First, add the highestRankingGroup column to your dataset dfx
dfx$highestRankingGroup <- seq(1, length(dfx$projectScore))
Since you have mentioned perl you can do a familar perl thing and simple split the workers column in whitespaces. I combined the splitting with functions from the plyr package which are always nice to work with.
library(plyr)
df.l <- dlply(dfx, "projectScore")
f.reshape <- function(x) {
wrk <- strsplit(x$workers, "\\s", perl = TRUE)
data.frame(worker = wrk[[1]]
, projectScore = x$projectScore
, highestRankingGroup = x$highestRankingGroup
)
}
df.tmp <- ldply(df.l, f.reshape)
df.z1 <- merge(df.tmp, dfy, by.x = "worker", by.y = "workerId")
Now you have to look for the max values in the projectScore column:
df.z2 <- ddply(df.z1, "name", function(x) x[x$projectScore == max(x$projectScore), ])
This produces:
R> df.z2
worker .id projectScore highestRankingGroup name aptitude
1 4 83 83 5 joe 34
2 7 85 85 3 jon 23
3 8 92 92 1 nick 122
4 5 89 89 2 steve 42
R>
You can reshape the df.z2 dataframe according to your personal taste. Simply look at the different steps and the produced objects in order to see at which step different columns, etc get introduced.
Before I start, I recommend that you go with #mropa's answer. This answer is a bit of fun I had messing about with your question. On the plus side, it does involve a bit of fun with function closures ;)
Essentially, I create a function that returns two functions.
updateDFz = function(dfy) {
## Create a default dfz matrix
dfz = dfy
dfz$HRG = 10000 ## Big max value
counter = 0
## Update the dfz matrix after every row
update = function(x) {
counter <<- counter + 1
for(i in seq_along(x)) {
if(is.element(x[i], dfz$workerId))
dfz[dfz$workerId == x[i],]$HRG <<- min(dfz[dfz$workerId == x[i],]$HRG, counter)
}
return(dfz)
}
## Get the dfz matrix
getDFz = function()
return(dfz)
list(getDFz=getDFz, update=update)
}
f = updateDFz(dfy)
lapply(strsplit(dfx$workers, " "), f$update)
f$getDFz()
As I said, a bit of fun ;)
Hopefully someone finds this useful.
# Recieves a data.frame and a search column
# Returns a data.frame of the first occurances of all unique values of the "search" column
getfirsts <- function(data, searchcol){
rows <- as.data.frame(match(unique(data[[searchcol]]), data[[searchcol]]))
firsts = data[rows[[1]],]
return(firsts)
}

Resources