Combine Strings with Missing Value - r

This is my sample data.
index <- c(1,2,3,4,5,6,7,8,9,10)
a <- c('a','b','c',NA,'D','e',NA,'g','h','i')
data <- data.frame(index,a)
What I would like to is create a new column name where only 'a' and 'b' stay. All others like 'c','d','e'...will be tagged as others, while NA stays as NA.
data$name = ifelse(!grepl('(a|b)',data$a),'others',data$name)
I tried to use the grepl function and it seems it is not working with data with missing values

In base R:
data$res <- as.character(data$a)
data$res[! data$a %in% c("a","b") & !is.na(data$a)] <- "Other"
data
# index a res
# 1 1 a a
# 2 2 b b
# 3 3 c Other
# 4 4 <NA> <NA>
# 5 5 D Other
# 6 6 e Other
# 7 7 <NA> <NA>
# 8 8 g Other
# 9 9 h Other
# 10 10 i Other
Note that the new column is of type character here.

Using dplyr and its recode function, you could do
data %>% mutate(name=recode(a, a="a", b="b", .default="other"))
# index a name
# 1 1 a a
# 2 2 b b
# 3 3 c other
# 4 4 <NA> <NA>
# 5 5 D other
# 6 6 e other
# 7 7 <NA> <NA>
# 8 8 g other
# 9 9 h other
# 10 10 i other
With a more complicated match, you migth use case_when instead
data %>% mutate(name=case_when(
is.na(a) ~ NA_character_,
a %in% c("a","b") ~ as.character(a),
TRUE ~ "other"))

Related

Create a matrix symmetric from long table in r

This is my data
df <- data.frame (Var1 <- c("a", "b", "c","d","e","b","c","d","e","c","d","e","d","e","e"),
Var2 <- c("a","a","a","a","a","b","b","b","b","c","c","c","d","d","e")
pre <- c(1,2,3,4,5,1,6,7,8,1,9,10,1,11,1) )
I would like to build a symmetric matrix with Var1 and Var2 function as rownames and colnames, and the matrix values are the Corresponding number in "pre" in r, like this:
a b c d e
a 1 2 3 4 5
b 2 1 6 7 8
c 3 6 1 9 10
d 4 7 9 1 11
e 5 8 10 11 1
This seems to be an easy problem, but I have googled a lot of posts, but it has not been solved, so I come here to ask, thank you!
Mengying
You can get the data in wide format first.
library(dplyr)
library(tidyr)
mat <- df %>%
pivot_wider(names_from = Var2, values_from = pre, values_fill = 0) %>%
column_to_rownames('Var1') %>%
as.matrix()
mat
# a b c d e
#a 1 0 0 0 0
#b 2 1 0 0 0
#c 3 6 1 0 0
#d 4 7 9 1 0
#e 5 8 10 11 1
Since you have a symmetric matrix you can copy the lower triangular matrix to upper triangle.
mat[upper.tri(mat)] <- t(mat)[upper.tri(mat)]
mat
# a b c d e
#a 1 2 3 4 5
#b 2 1 6 7 8
#c 3 6 1 9 10
#d 4 7 9 1 11
#e 5 8 10 11 1
data
df <- data.frame (Var1 = c("a", "b", "c","d","e","b","c","d","e","c","d","e","d","e","e"),
Var2 = c("a","a","a","a","a","b","b","b","b","c","c","c","d","d","e"),
pre = c(1,2,3,4,5,1,6,7,8,1,9,10,1,11,1) )
Here is an option with igraph package
g <- graph_from_data_frame(df,directed = FALSE)
E(g)$pre <- df$pre
get.adjacency(g,attr = "pre")
which gives
a b c d e
a 1 2 3 4 5
b 2 1 6 7 8
c 3 6 1 9 10
d 4 7 9 1 11
e 5 8 10 11 1
Base R solution (using data provided by Ronak):
# Crosstab:
mdat <- as.data.frame.matrix(xtabs(pre ~ Var1 + Var2, df))
# Reflect on the diag (thanks #Ronak Shah):
mdat[upper.tri(mdat)] <- t(mdat)[upper.tri(mdat)]
As #ThomasIsCoding points as well we can use this one-liner:
xtabs(pre ~ ., unique(rbind(df, cbind(setNames(rev(df[-3]), names(df)[-3]), df[3] ))))
As #thelatemail points out we can also:
xtabs(pre ~ ., unique(data.frame(Map(c, df, df[c(2,1,3)]))))
Here's a base R version:
df <- data.frame (Var1 = c("a", "b", "c","d","e","b","c","d","e","c","d","e","d","e","e"),
Var2 = c("a","a","a","a","a","b","b","b","b","c","c","c","d","d","e"),
pre = c(1,2,3,4,5,1,6,7,8,1,9,10,1,11,1))
# Generate new matrix/data frame
mat2 <- matrix(0, length(unique(df$Var1)), length(unique(df$Var2)))
# Name the columns and rows so we can access values
rownames(mat2) <- unique(df$Var1)
colnames(mat2) <- unique(df$Var2)
# Save values into appropriate places into data frame
mat2[as.matrix(df[, 1:2])] <- as.matrix(df[, 3])
# Using upper triangle trick from #Ronak Shah's answer
mat2[upper.tri(mat)] <- t(mat2)[upper.tri(mat2)]
# See results
mat2
# a b c d e
# a 1 2 3 4 5
# b 2 1 6 7 8
# c 3 6 1 9 10
# d 4 7 9 1 11
# e 5 8 10 11 1

Combining elements of one column into two columns by group in R

Given a two column data.frame with one containing group labels and a second containing integer values ordered from smallest to largest. How can the data be expanded creating pairs of combinations of the integer column?
Not sure the best way to state this. I'm not interested in all possible combinations but instead all unique combinations starting from the lowest value.
In r, the combn function gives the desired output not considering groups, for example:
t(combn(seq(1:4),2))
[,1] [,2]
[1,] 1 2
[2,] 1 3
[3,] 1 4
[4,] 2 3
[5,] 2 4
[6,] 3 4
Since the first values is 1 we get the unique combination of (1,2) and not the additional combination of (2,1) which I don't need. How would one then apply a similar method by groups?
for example given a data.frame
test <- data.frame(Group = rep(c("A","B"),each=4),
Val = c(1,3,6,8,2,4,5,7))
test
Group Val
1 A 1
2 A 3
3 A 6
4 A 8
5 B 2
6 B 4
7 B 5
8 B 7
I was able to come up with this solution that gives the desired output:
test <- data.frame(Group = rep(c("A","B"),each=4),
Val = c(1,3,6,8,2,4,5,7))
j=1
for(i in unique(test$Group)){
if(j==1){
one <- filter(test,i == Group)
two <- data.frame(t(combn(one$Val,2)))
test1 <- data.frame(Group = i,Val1=two$X1,Val2=two$X2)
j=j+1
}else{
one <- filter(test,i == Group)
two <- data.frame(t(combn(one$Val,2)))
test2 <- data.frame(Group = i,Val1=two$X1,Val2=two$X2)
test1 <- rbind(test1,test2)
}
}
test1
Group Val1 Val2
1 A 1 3
2 A 1 6
3 A 1 8
4 A 3 6
5 A 3 8
6 A 6 8
7 B 2 4
8 B 2 5
9 B 2 7
10 B 4 5
11 B 4 7
12 B 5 7
However, this is not elegant and is really slow as the number of groups and length of each group become large. It seems like there should be a more elegant and efficient solution but so far I have not come across anything on SO.
I would appreciate any ideas!
here is a data.table approach
library( data.table )
#make test a data.table
setDT(test)
#split by group
L <- split( test, by = "Group")
#get unique combinations of 2 Vals
L2 <- lapply( L, function(x) {
as.data.table( t( combn( x$Val, m = 2, simplify = TRUE ) ) )
})
#merge them back together
data.table::rbindlist( L2, idcol = "Group" )
# Group V1 V2
# 1: A 1 3
# 2: A 1 6
# 3: A 1 8
# 4: A 3 6
# 5: A 3 8
# 6: A 6 8
# 7: B 2 4
# 8: B 2 5
# 9: B 2 7
#10: B 4 5
#11: B 4 7
#12: B 5 7
You can set simplify = F in combn() and then use unnest_wider() in dplyr.
library(dplyr)
library(tidyr)
test %>%
group_by(Group) %>%
summarise(Val = combn(Val, 2, simplify = F)) %>%
unnest_wider(Val, names_sep = "_")
# Group Val_1 Val_2
# <chr> <dbl> <dbl>
# 1 A 1 3
# 2 A 1 6
# 3 A 1 8
# 4 A 3 6
# 5 A 3 8
# 6 A 6 8
# 7 B 2 4
# 8 B 2 5
# 9 B 2 7
# 10 B 4 5
# 11 B 4 7
# 12 B 5 7
library(tidyverse)
df2 <- split(df$Val, df$Group) %>%
map(~gtools::combinations(n = 4, r = 2, v = .x)) %>%
map(~as_tibble(.x, .name_repair = "unique")) %>%
bind_rows(.id = "Group")

Using mapply to set values based on values in other columns

Based on my previous question, I need help with using the mapply function correctly.
x <- data.frame(a = seq(1,3), b = seq(2,4), c = seq(3,5), d = seq(4,6), b2 = seq(5,7), c2 = seq(6,8), d2 = seq(7,9))
# a b c d b2 c2 d2
# 1 2 3 4 5 6 7
# 2 3 4 5 6 7 8
# 3 4 5 6 7 8 9
My goal is to look at the columns b2 to d2 and, based on their values, change the values in columns b to d respectively. I can do this for a single column quite easily:
x[which(x$b2 == 7),][b] <- NA_real_
My problem is that I want this applied across all my columns but I don't know how to convert this single column formula to work on multiple columns. I tried:
onez <- c(2:4)
twoz <- c(5:7)
f <- function(df, ones, twos) {
df[which(df[,twos] == 7),][ones] <- NA_real_
}
mapply(f, df = x, ones = onez, twos = twoz)
But I'm getting error messages (incorrect dimensions etc) and I see that my function is messy but I lack the knowledge how to fix it.
One way to do it is to tell it to:
Get the subset of the data frame with columns 5, 6, 7: x[5:7]
Check from that subset which values satisfy your condition: x[5:7] == 7
Replace those values with NA: ... <- NA
This gives the following,
x[5:7][x[5:7] == 7] <- NA
x
# a b c d b2 c2 d2
#1 1 2 3 4 5 6 NA
#2 2 3 4 5 6 NA 8
#3 3 4 5 6 NA 8 9
If you want the NAs to be replaced at x[2:4], then you can do,
x[2:4][x[5:7] == 7] <- NA
x
# a b c d b2 c2 d2
#1 1 2 3 NA 5 6 7
#2 2 3 NA 5 6 7 8
#3 3 NA 5 6 7 8 9

Reorder a subset of an R data.frame modifying the row names as well

Given a data.frame:
foo <- data.frame(ID=1:10, x=1:10)
rownames(foo) <- LETTERS[1:10]
I would like to reorder a subset of rows, defined by their row names. However, I would like to swap the row names of foo as well. I can do
sel <- c("D", "H") # rows to reorder
foo[sel,] <- foo[rev(sel),]
sel.wh <- match(sel, rownames(foo))
rownames(foo)[sel.wh] <- rownames(foo)[rev(sel.wh)]
but that is long and complicated. Is there a simpler way?
We can replace the sel values in rownames with the reverse of sel.
x <- rownames(foo)
foo[replace(x, x %in% sel, rev(sel)), ]
# ID x
#A 1 1
#B 2 2
#C 3 3
#H 8 8
#E 5 5
#F 6 6
#G 7 7
#D 4 4
#I 9 9
#J 10 10
Not as concise as ronak-shah's answer, but you could also use order.
# extract row names
temp <- row.names(foo)
# reset of vector
temp[which(temp %in% sel)] <- temp[rev(which(temp %in% sel))]
# reset order of data.frame
foo[order(temp),]
ID x
A 1 1
B 2 2
C 3 3
H 8 8
E 5 5
F 6 6
G 7 7
D 4 4
I 9 9
J 10 10
As noted in the comments, this relies on the row names following a lexicographical order. In instances where this is not true, we can use match.
# set up
set.seed(1234)
foo <- data.frame(ID=1:10, x=1:10)
row.names(foo) <- sample(LETTERS[1:10])
sel <- c("D", "H")
Now, the rownames are
# initial data.frame
foo
ID x
B 1 1
F 2 2
E 3 3
H 4 4
I 5 5
D 6 6
A 7 7
G 8 8
J 9 9
C 10 10
# grab row names
temp <- row.names(foo)
# reorder vector containing row names
temp[which(temp %in% sel)] <- temp[rev(which(temp %in% sel))]
Using, match along with order
foo[order(match(row.names(foo), temp)),]
ID x
B 1 1
F 2 2
E 3 3
D 6 6
I 5 5
H 4 4
A 7 7
G 8 8
J 9 9
C 10 10
your data frame is small so you can duplicate it then change the value of each raw:
footmp<-data.frame(foo)
foo[4,]<-footemp[8,]
foot{8,]<-footemp[4,]
Bob

R - Subset dataframe to include only subjects with more than 1 record

I'd like to subset a dataframe to include all records for subjects that have >1 record, and exclude those subjects with only 1 record.
Let's take the following dataframe;
mydata <- data.frame(subject_id = factor(c(1,2,3,4,4,5,5,6,6,7,8,9,9,9,10)),
variable = rnorm(15))
The code below gives me the subjects with >1 record using duplicated();
duplicates <- mydata[duplicated(mydata$subject_id),]$subject_id
But I want to retain in my subset all records for each subject with >1 record, so I tried;
mydata[mydata$subject_id==as.factor(duplicates),]
Which does not return the result I'm expecting.
Any ideas?
A data.table solution
set.seed(20)
subject_id <- as.factor(c(1,2,3,4,4,5,5,6,6,7,8,9,9,9,10))
variable <- rnorm(15)
mydata<-as.data.frame(cbind(subject_id, variable))
library(data.table)
setDT(mydata)[, .SD[.N > 1], by = subject_id] # #Thanks David.
# subject_id variable
# 1: 4 -1.3325937
# 2: 4 -0.4465668
# 3: 5 0.5696061
# 4: 5 -2.8897176
# 5: 6 -0.8690183
# 6: 6 -0.4617027
# 7: 9 -0.1503822
# 8: 9 -0.6281268
# 9: 9 1.3232209
A simple alternative is to use dplyr:
library(dplyr)
dfr <- data.frame(a=sample(1:2,10,rep=T), b=sample(1:5,10, rep=T))
dfr <- group_by(dfr, b)
dfr
# Source: local data frame [10 x 2]
# Groups: b
#
# a b
# 1 2 4
# 2 2 2
# 3 2 5
# 4 2 1
# 5 1 2
# 6 1 3
# 7 2 1
# 8 2 4
# 9 1 4
# 10 2 4
filter(dfr, n() > 1)
# Source: local data frame [8 x 2]
# Groups: b
#
# a b
# 1 2 4
# 2 2 2
# 3 2 1
# 4 1 2
# 5 2 1
# 6 2 4
# 7 1 4
# 8 2 4
Here you go (I changed your variable to var <- rnorm(15):
set.seed(11)
subject_id<-as.factor(c(1,2,3,4,4,5,5,6,6,7,8,9,9,9,10))
var<-rnorm(15)
mydata<-as.data.frame(cbind(subject_id,var))
x1 <- c(names(table(mydata$subject_id)[table(mydata$subject_id) > 1]))
x2 <- which(mydata$subject_id %in% x1)
mydata[x2,]
subject_id var
4 4 0.3951076
5 4 -2.4129058
6 5 -1.3309979
7 5 -1.7354382
8 6 0.4020871
9 6 0.4628287
12 9 -2.1744466
13 9 0.4857337
14 9 1.0245632
Try:
> mydata[mydata$subject_id %in% mydata[duplicated(mydata$subject_id),]$subject_id,]
subject_id variable
4 4 -1.3325937
5 4 -0.4465668
6 5 0.5696061
7 5 -2.8897176
8 6 -0.8690183
9 6 -0.4617027
12 9 -0.1503822
13 9 -0.6281268
14 9 1.3232209
I had to edit your data frame a little bit:
set.seed(20)
subject_id <- as.factor(c(1,2,3,4,4,5,5,6,6,7,8,9,9,9,10))
variable <- rnorm(15)
mydata<-as.data.frame(cbind(subject_id, variable))
Now to get all the rows for subjects that appear more than once:
mydata[duplicated(mydata$subject_id)
| duplicated(mydata$subject_id, fromLast = TRUE), ]
# subject_id variable
# 4 4 -1.3325937
# 5 4 -0.4465668
# 6 5 0.5696061
# 7 5 -2.8897176
# 8 6 -0.8690183
# 9 6 -0.4617027
# 12 9 -0.1503822
# 13 9 -0.6281268
# 14 9 1.3232209
Edit: this would also work, using your duplicates vector:
mydata[mydata$subject_id %in% duplicates, ]

Resources