Transforming data.frame in R - r

I have the following data frame:
foo <- data.frame( abs( cbind(rnorm(3),rnorm(3, mean=.8),rnorm(3, mean=.9),rnorm(3, mean=1))))
colnames(foo) <- c("w","x","y","z")
rownames(foo) <- c("n","q","r")
foo
# w x y z
# n 1.51550092 1.4337572 1.2791624 1.1771230
# q 0.09977303 0.8173761 1.6123402 0.1510737
# r 1.17083866 1.2469347 0.8712135 0.8488029
What I want to do is to change it into :
newdf
# 1 n w 1.51550092
# 2 q w 0.09977303
# 3 r w 1.17083866
# 4 n x 1.43375725
# 5 q x 0.81737606
# 6 r x 1.24693468
# 7 n y 1.27916241
# 8 q y 1.61234016
# 9 r y 0.87121353
# 10 n z 1.17712302
# 11 q z 0.15107369
# 12 r z 0.84880292
What's the way to do it?

There are several ways to do this. Here's one:
set.seed(1)
foo <- data.frame( abs( cbind(rnorm(3),
rnorm(3, mean=.8),
rnorm(3, mean=.9),
rnorm(3, mean=1))))
colnames(foo) <- c("w","x","y","z")
rownames(foo) <- c("n","q","r")
foo
# w x y z
# n 0.6264538 2.39528080 1.387429 0.6946116
# q 0.1836433 1.12950777 1.638325 2.5117812
# r 0.8356286 0.02046838 1.475781 1.3898432
data.frame(rows = row.names(foo), stack(foo))
# rows values ind
# 1 n 0.62645381 w
# 2 q 0.18364332 w
# 3 r 0.83562861 w
# 4 n 2.39528080 x
# 5 q 1.12950777 x
# 6 r 0.02046838 x
# 7 n 1.38742905 y
# 8 q 1.63832471 y
# 9 r 1.47578135 y
# 10 n 0.69461161 z
# 11 q 2.51178117 z
# 12 r 1.38984324 z

reshape2:::melt() is particularly well suited to this transformation:
library(reshape2)
foo <- cbind(ID=rownames(foo), foo)
melt(foo)
# Using ID as id variables
# ID variable value
# 1 n w 1.7337416
# 2 q w 0.5890877
# 3 r w 0.2245508
# 4 n x 0.5237346
# 5 q x 0.9320455
# 6 r x 0.8156573
# 7 n y 1.9287306
# 8 q y 1.1604229
# 9 r y 1.7631215
# 10 n z 0.3591350
# 11 q z 0.9740170
# 12 r z 0.5621968

Related

How do I make a function that returns the equation from the argument?

For example, I have a function with the arguments "x", "d" and "equation". The "x" argument is my data frame, the "d" argument is the numeric data frame column, and in "equation" argument I wanted to insert the equation "d * 5 ^ 0.02".
As an output, I need to have a new column "V" in the data frame, with the result of the argument equation.
My idea went wrong:
myfunction <- function(x, d, equation, ...){
x$V <- equation
}
myfunction(x=x, d = x$d, equation = c("d"*5^0.02))
I assume you want equation applied to x$d, and the result to be written to x$V.
Providing an "equation" like that is very unusual, and prone to error. Consider creating a function: f <- function(x) x * 5^0.02, and then doing the following.
# dummy data
x <- data.frame(d = 1:10)
# your equation
f <- function(x) x * 5^0.02
g <- function(x, d, f) {
# call function f with column d as its argument
x$V <- f(x[[d]]))
return(x)
}
g(x, "d", f)
d V
1 1 1.032712
2 2 2.065425
3 3 3.098137
4 4 4.130850
5 5 5.163562
Similar to other answers. However, functions that operate in side-effect preclude things like: assign to a new variable (one answer suggests a way to do this), or operate within a pipeline (e.g., %>%).
I suggest not using side-effect (<<- and assign).
myfunction <- function(x, d, equation, ...) {
x$V <- eval(substitute(equation), envir = x)
x
}
x <- data.frame(d = 1:5)
myfunction(x, x$d, d*5^0.02)
# d V
# 1 1 1.032712
# 2 2 2.065425
# 3 3 3.098137
# 4 4 4.130850
# 5 5 5.163562
The original x is unchanged. One advantage to using a functional vice side-effect paradigm is that it will flow better in (say) pipes:
library(dplyr)
x %>%
myfunction(d, d*5^0.02)
# d V
# 1 1 1.032712
# 2 2 2.065425
# 3 3 3.098137
# 4 4 4.130850
# 5 5 5.163562
whereas using side-effect might not be affecting the x that is intended/desired.
x %>%
filter(between(d, 2, 4)) %>%
myfunction(d, d*5^0.02)
# d V
# 1 2 2.065425
# 2 3 3.098137
# 3 4 4.13085
(This does not work when side-effect is used.)
Alternatively, we already have a function in base R for that:
within(x, { V = d*5^0.02 })
# d V
# 1 1 1.032712
# 2 2 2.065425
# 3 3 3.098137
# 4 4 4.130850
# 5 5 5.163562
transform(x, V = d*5^0.02 )
# d V
# 1 1 1.032712
# 2 2 2.065425
# 3 3 3.098137
# 4 4 4.130850
# 5 5 5.163562
Did you have something like this in mind?
myfunction <- function(x, d, equation, ...) x$v <<- eval(substitute(equation))
x <- data.frame(d = 1:5)
myfunction(x=x, d = x$d, equation = d*5^0.02)
x
#> d v
#> 1 1 1.032712
#> 2 2 2.065425
#> 3 3 3.098137
#> 4 4 4.130850
#> 5 5 5.163562
After pondering this a bit, I wonder if you are trying to reinvent within?
within(x, v <- d*5^0.02)
#> d v
#> 1 1 1.032712
#> 2 2 2.065425
#> 3 3 3.098137
#> 4 4 4.130850
#> 5 5 5.163562
Created on 2020-05-27 by the reprex package (v0.3.0)
Here you can return to the global environment direct:
x <- data.frame(c(1:2,5:6),c(7:10))
x
colnames(x) <- c("V1","d")
myfunction <- function(x, d, equation,named.df="NA" ,...){
x$V <- equation
assign(named.df,x, envir=.GlobalEnv)
}
myfunction(x=x, d = x$d, equation = c(x$d*5^0.02),"function result" )

Getting the length of a list

I am attempting to decipher a list res which has structure as per below:
How would I go about converting this to a 21 (row) by 2 (column) dataframe?
I can do it by manually hard-coding the 21:
data.frame(matrix(unlist(res), nrow=21 ))
However I would like to use length(res) which unfortunately returns 1
As it is a list use [[ to index it to get the matrix and then convert to dataframe.
data.frame(res[[1]])
Or use unlist with recursive = FALSE
data.frame(unlist(res[[1]], recursive = FALSE))
Using a reproducble example,
res <- list(matrix(letters,ncol = 2))
data.frame(res[[1]])
# X1 X2
#1 a n
#2 b o
#3 c p
#4 d q
#5 e r
#6 f s
#7 g t
#8 h u
#9 i v
#10 j w
#11 k x
#12 l y
#13 m z
You can also magrittr::extract2
res %>% magrittr::extract2(1)
## A tibble: 21 x 2
# V1 V2
# <chr> <chr>
# 1 O M
# 2 W S
# 3 C Q
# 4 L C
# 5 M K
# 6 R M
# 7 U Q
# 8 I T
# 9 K J
#10 H V
## … with 11 more rows
or use purrr::flatten_dfc
purrr::flatten_dfc(res)
## A tibble: 21 x 2
# V1 V2
# <chr> <chr>
# 1 O M
# 2 W S
# 3 C Q
# 4 L C
# 5 M K
# 6 R M
# 7 U Q
# 8 I T
# 9 K J
#10 H V
## … with 11 more rows
Sample data
set.seed(2018)
res <- list(
as_tibble(matrix(sample(LETTERS, 21 * 2, replace = T), nrow = 21, ncol = 2))
)

Getting the maximum common words in R

I have data of the form:
ID A1 A2 A3 ... A100
1 john max karl ... kevin
2 kevin bosy lary ... rosy
3 karl lary bosy ... hale
.
.
.
10000 isha john lewis ... dave
I want to get one ID for each ID such that both of them have maximum number of common attributes(A1,A2,..A100)
How can I do this in R ?
Edit: Let's call the output a MatchId:
ID MatchId
1 70
2 4000
.
.
10000 3000
I think this gets what you're looking for:
library(dplyr)
# make up some data
set.seed(1492)
rbind_all(lapply(1:15, function(i) {
x <- cbind.data.frame(stringsAsFactors=FALSE, i, t(sample(LETTERS, 10)))
colnames(x) <- c("ID", sprintf("A%d", 1:10))
x
})) -> dat
print(dat)
## Source: local data frame [15 x 11]
##
## ID A1 A2 A3 A4 A5 A6 A7 A8 A9 A10
## 1 1 H F E C B A R J Z N
## 2 2 Q P E M L Z C G V Y
## 3 3 Q J D N B T L K G Z
## 4 4 D Y U F V O I C A W
## 5 5 T Z D I J F R C B S
## 6 6 Q D H U P V O E R N
## 7 7 C L I M E K N S X Z
## 8 8 M J S E N O F Y X I
## 9 9 R H V N M T Q X L S
## 10 10 Q H L Y B W S M P X
## 11 11 M N J K B G S X V R
## 12 12 W X A H Y D N T Q I
## 13 13 K H V J D X Q W A U
## 14 14 M U F H S T W Z O N
## 15 15 G B U Y E L A Q W O
# get commons
rbind_all(lapply(1:15, function(i) {
rbind_all(lapply(setdiff(1:15, i), function(j) {
data.frame(id1=i,
id2=j,
common=length(intersect(c(t(dat[i, 2:11])),
c(t(dat[j, 2:11])))))
}))
})) -> commons
commons %>%
group_by(id1) %>%
top_n(1, common) %>%
filter(row_number()==1) %>%
select(ID=id1, MatchId=id2)
## Source: local data frame [15 x 2]
## Groups: ID
##
## ID MatchId
## 1 1 5
## 2 2 7
## 3 3 5
## 4 4 12
## 5 5 1
## 6 6 9
## 7 7 8
## 8 8 7
## 9 9 10
## 10 10 9
## 11 11 9
## 12 12 13
## 13 13 12
## 14 14 8
## 15 15 2
Using similar data as provided by #hrbrmstr
set.seed(1492)
dat <- do.call(rbind, lapply(1:15, function(i) {
x <- cbind.data.frame(stringsAsFactors=FALSE, i, t(sample(LETTERS, 10)))
colnames(x) <- c("ID", sprintf("A%d", 1:10))
x
}))
You could achieve the same using base R only
Res <- sapply(seq_len(nrow(dat)),
function(x) apply(dat[-1], 1,
function(y) length(intersect(dat[x, -1], y))))
diag(Res) <- -1
cbind(dat[1], MatchId = max.col(Res, ties.method = "first"))
# ID MatchId
# 1 1 5
# 2 2 7
# 3 3 5
# 4 4 12
# 5 5 1
# 6 6 9
# 7 7 8
# 8 8 7
# 9 9 10
# 10 10 9
# 11 11 9
# 12 12 13
# 13 13 12
# 14 14 8
# 15 15 2
If I understand correctly, the requirement is to obtain the maximum number of common attributes for each ID.
Frequency tables can be obtained using table() and recursively in lapply(), assuming that ID column is unique - slight modification is necessary if not (unique(df$ID) rather than df$ID in lapply()). The maximum frequencies can be taken and, if there is a tie, only the first one is chosen. Finally they are combined by do.call().
df <- read.table(header = T, text = "
ID A1 A2 A3 A100
1 john max karl kevin
2 kevin bosy lary rosy
3 karl lary bosy hale
10000 isha john lewis dave")
do.call(rbind, lapply(df$ID, function(x) {
tbl <- table(unlist(df[df$ID == x, 2:ncol(df)]))
data.frame(ID = x, MatchId = tbl[tbl == max(tbl)][1])
}))
# ID MatchId
#john 1 1
#kevin 2 1
#karl 3 1
#isha 10000 1

Delete rows after a certain sequence of values in a certain column

a <- c("A","A","A","B","B","B","C","C","C","C","D","D","D","D","D")
b <- c("x","y","z","x","x","z","y","z","z","z","y","z","z","z","x")
df = data.frame(a,b)
a b
1 A x
2 A y
3 A z
4 B x
5 B x
6 B z
7 C y
8 C z
9 C z
10 C z
11 D y
12 D z
13 D z
14 D z
15 D x
For every group A, B, C, D, I'd like to delete the value z in column b every time the combination y,z appears at the end of the group.
If we have the case of a=="C", where the b-values are y,z,z,z, I'd like to delete all z's. However, in a=="D", nothing has to change as x is the last value.
The results looks like this:
a b
1 A x
2 A y
4 B x
5 B x
6 B z
7 C y
11 D y
12 D z
13 D z
14 D z
15 D x
By grouping in dplyr, I can identify the last occurence of each value in A, so the basic case depictured in a=="A"is not a problem. I have trouble finding a solution for the case of a=="C", where I could have one occurence of y followed by 20 occurences of z.
You can use by and cummin in base R:
df[unlist(by(df$b, interaction(df$a), FUN = function(x) {
tmp <- rev(cummin(rev(x == "z")))
if (tail(x[!tmp], 1) == "y") !tmp else rep(TRUE, length(x))
})), ]
The result:
a b
1 A x
2 A y
4 B x
5 B x
6 B z
7 C y
11 D y
12 D z
13 D z
14 D z
15 D x
Here's a possible data.table solution. Basically, I'm creating an logical index that satisfies 3 conditions at once: being a z, that the first z comes after y and that the last value is z and then I'm just evaluating it.
library(data.table)
setDT(df)[, indx := b == "z" &
max(which(b == "z")) == .N &
ifelse(min(which(b == "z")) == 1L,
TRUE,
b[min(which(b == "z")) - 1L] == "y"),
by = a][!(indx)]
# a b indx
# 1: A x FALSE
# 2: A y FALSE
# 3: B x FALSE
# 4: B x FALSE
# 5: B z FALSE
# 6: C y FALSE
# 7: D y FALSE
# 8: D z FALSE
# 9: D z FALSE
# 10: D z FALSE
# 11: D x FALSE
Here's a base solution:
do.call("rbind", by(df, df$a, FUN = function(x) {
if(x$b[length(x$b)] == "z") {
y <- which(x$b == "y")
if(!length(y)) {
return(x)
}
z <- which(x$b == "z")
if(!length(z)) {
return(x)
}
# check if y isn't immediately before z
if(max(y) - min(z) > 1) {
return(x)
} else {
return(x[-z,])
}
} else {
return(x)
}
}))
And the result:
a b
A.1 A x
A.2 A y
B.4 B x
B.5 B x
B.6 B z
C C y
D.11 D y
D.12 D z
D.13 D z
D.14 D z
D.15 D x
Not so efficient, but works fine:
require(stringr)
df2 <- data.frame(row.names = c("a", "b"))
for(i in levels(factor(df$a))) {
temp <- paste(df$b[df$a == i], collapse = "")
if(str_detect(temp, "yz") & str_detect(temp, "z$")) {
temp <- gsub("z", "", temp)
df2 <- rbind(df2, data.frame(a = rep(i, nchar(temp)), b = substring(temp, seq(1,nchar(temp),1), seq(1,nchar(temp),1))))
} else df2 <- rbind(df2, data.frame(a = rep(i, nchar(temp)), b = substring(temp, seq(1,nchar(temp),1), seq(1,nchar(temp),1))))
}
# a b
# 1 A x
# 2 A y
# 3 B x
# 4 B x
# 5 B z
# 6 C y
# 7 D y
# 8 D z
# 9 D z
# 10 D z
# 11 D x

subset one list on another

I have been looking at mapply documentation but I cannot find an example close enough to help me get started.
I have lists foo and bar:
set.seed(123)
f <- data.frame(y=1:10,x=sample(LETTERS,10))
foo <- list(f,f)
b <- data.frame(x=c("J","U","A"))
ba <- data.frame(x=c("J","W"))
bar <- list(b,ba)
I can subset f with b using:
result <- f[f$x %in% b$x ,]
I want to do this subset but for the whole lists foo and bar i.e. subset foo[[1]] by foo[[1]]["x"] on bar[[1]] and foo[[2]] by foo[[2]]["x"] on bar[[2]] etc...
the result would be:
>foo
[[1]]
y x
3 3 J
4 4 U
6 6 A
[[2]]
y x
3 3 J
5 5 W
Like so...?
mapply(merge,foo,bar,SIMPLIFY = FALSE)
[[1]]
x y
1 A 6
2 J 3
3 U 4
[[2]]
x y
1 J 3
2 W 5

Resources