Ordering columns of a data frame - r

I would like to reorder the columns of the data frame given below.
structure(list(DETECTION = c(0.000219, 0.000673, 0.000322, 0.602006,
0.000468, 0.204022, 0.000491, 0.003067), VALUE = structure(1:8, .Label = c("10071_s_at",
"1053_at", "117_at", "1255_g_at", "1294_at", "1320_at", "1405_i_at",
"14312_at"), class = "factor")), .Names = c("DETECTION", "VALUE"
), class = "data.frame", row.names = c(NA, -8L))
I want the numeric column (DETECTION) to be in the second position.
I tried something here
d1 <- data[1, , drop = FALSE]
nums <- d1[, nn <- sapply(d1, is.numeric)]
ch <- d1[, !nn, drop = FALSE]
id <- names(ch[, grepl('_at$', as.character(unlist(ch))), drop = FALSE])
p <- names(nums)
d <- data[,c(id,p)]
However, names(nums) returns NULL. What is going wrong here?

dt <- as.data.table(data)
From R help : " When it's required to reorder the columns of a data.table, the idiomatic way is to use setcolorder(x, neworder), instead of doing x <- x[, neworder, with=FALSE]. This is because the latter makes an entire copy of the data.table, which maybe unnecessary in most situations."
setcolorder(dt,c("VALUE","DETECTION"))

names(nums) is NULL because the dimensions were dropped. You can add the argument drop to keep the dimensions as they are:
names(nums)
#NULL
nums <- d1[, nn <- sapply(d1, is.numeric), drop=FALSE]
names(nums)
#[1] "DETECTION"

Related

Removing the special symbols in data.frame column values

I have two data frame each with a column Name
df1:
name
#one2
!iftwo
there_2_go
come&go
df1 = structure(list(name = c("#one2", "!iftwo", "there_2_go", "come&go")),.Names = c("name"), row.names = c(NA, -4L), class = "data.frame")
df2:
name
One2
IfTwo#
there-2-go
come.go
df2 = structure(list(name = c("One2", "IfTwo#", "there-2-go", "come.go")),.Names = c("name"), row.names = c(NA, -4L), class = "data.frame")
Now, comparing the two data frames for inequality with %in% is cumbersome because of the special symbols. Removing the special symbols with stringr could be useful, but how exactly can we use stringr functions together with %in% and display the mismatches between the two frames?
I have already used mutate() with tolower() to convert everything to lowercase, as follows:
df1<-mutate(df1,name=tolower(df1$name))
df2<-mutate(df2,name=tolower(df2$name))
Current output of comparison:
df2[!(df2 %in% df1),]
[1] "one2" "iftwo#" "there-2-go" "come.go"
Expected output as essentially the contents are same but with special symbols:
df2[!(df2 %in% df1),]
character(0)
Question : How do we ignore the symbols in the contents of the Frame
Here it is in a function,
# Flag the entries of df1$name that have no counterpart in df2$name once
# punctuation and letter case are ignored.
#
# df1, df2: data frames with a `name` column.
# Returns a logical vector with one element per row of df1, named by the
# normalised df1 names; TRUE means "no match found in df2".
f1 <- function(df1, df2) {
  normalise <- function(x) tolower(gsub("[[:punct:]]", "", x))
  i1 <- normalise(df1$name)
  i2 <- normalise(df2$name)
  # A df2 name made only of punctuation normalises to "", and an empty
  # alternative in the pattern would match every string, so drop those.
  i2 <- i2[nzchar(i2)]
  matched <- if (length(i2) > 0L) {
    # Build the alternation once instead of once per df1 element.
    pattern <- paste(i2, collapse = "|")
    vapply(i1, function(i) grepl(pattern, i), logical(1))
  } else {
    # Nothing to compare against: nothing can match.
    stats::setNames(rep(FALSE, length(i1)), i1)
  }
  !matched
}
# TRUE marks a df1 name with no counterpart in df2.
# (The first argument must be df1; a bare `df` is the F-density function.)
f1(df1, df2)
# one2 iftwo there2go comego
# FALSE FALSE FALSE FALSE
#or use it for indexing,
df2[f1(df1, df2),]
#character(0)

Merging Long-Form Data that has NAs with Wide-Form Complete Data To Override NAs

So I have three data sets that I need to merge. These contain school data and read/math scores for grades 4 and 5. One of them is a long-form data set that has a lot of missing values in some variables (yes, I do need the data in long form), and the other two have the complete data (including the values that are missing from the long-form set) in wide form. All of these data frames contain a column that has a unique ID number for each individual in the database.
Here is a full reproducible example that generates a small example of the types of data.frames I am working with... The three data frames that I need to use are the following: school_lf, school4 and school5. school_lf has the long form data with NAs and school4 and school5 are the dfs I need to use to populate the NA's in this long form data (by id and grade)
set.seed(890)
school <- NULL
school$id <-sample(102938:999999, 100)
school$selected <-sample(0:1, 100, replace = T)
school$math4 <- sample(400:500, 100)
school$math5 <- sample(400:500, 100)
school$read4 <- sample(400:500, 100)
school$read5 <- sample(400:500, 100)
school <- as.data.frame(school)
# Delete observations at random from the school df
indm4 <- which(school$math4 %in% sample(school$math4, 25))
school$math4[indm4] <- NA
indm5 <- which(school$math5 %in% sample(school$math5, 50))
school$math5[indm5] <- NA
indr4 <- which(school$read4 %in% sample(school$read4, 70))
school$read4[indr4] <- NA
indr5 <- which(school$read5 %in% sample(school$read5, 81))
school$read5[indr5] <- NA
# Separate Read and Math
read <- as.data.frame(subset(school, select = -c(math4, math5)))
math <- as.data.frame(subset(school, select = -c(read4, read5)))
# Now turn this into long form data...
clr <- melt(read, id.vars = c("id", "selected"), variable.name = "variable", value.name = "readscore")
clm <- melt(math, id.vars = c("id", "selected"), value.name = "mathscore")
# Clean up the grades for each of these...
clr$grade <- ifelse(clr$variable == "read4", 4,
ifelse(clr$variable == "read5", 5, NA))
clm$grade <- ifelse(clm$variable == "math4", 4,
ifelse(clm$variable == "math5", 5, NA))
# Put all these in one df
school_lf <-cbind(clm, clr$readscore)
school_lf$readscore <- school_lf$`clr$readscore` # renames
school_lf$`clr$readscore` <- NULL # deletes
school_lf$variable <- NULL # deletes
###############
# Generate the 2 data frames with IDs that have the full data
set.seed(890)
school4 <- NULL
school4$id <-sample(102938:999999, 100)
school4$selected <-sample(0:1, 100, replace = T)
school4$math4 <- sample(400:500, 100)
school4$read4 <- sample(400:500, 100)
school4$grade <- 4
school4 <- as.data.frame(school4)
set.seed(890)
school5 <- NULL
school5$id <-sample(102938:999999, 100)
school5$selected <-sample(0:1, 100, replace = T)
school5$math5 <- sample(400:500, 100)
school5$read5 <- sample(400:500, 100)
school5$grade <- 5
school5 <- as.data.frame(school5)
I need to merge the wide-form data into the long-form data to replace the NAs with the actual values. I have tried the code below, but it introduces several columns instead of merging the read scores and the math scores where there's NA's. I simply need one column with the read scores and one with the math scores, instead of six separate columns (read.x, read.y, math.x, math.y, mathscore and readscore).
sch <- merge(school_lf, school4, by = c("id", "grade", "selected"), all = T)
sch <- merge(sch, school5, by = c("id", "grade", "selected"), all = T)
Any help is highly appreciated! I've been trying to solve this for hours now and haven't made any progress (so figured I'd ask here)
You can use the coalesce function from dplyr. If a value in the first vector is NA, it will see if the value at the same position in the second vector is not NA and select it. If again NA, it goes to the third.
library(dplyr)
# For each score, take the first non-missing value among the long-form
# column and the two wide-form columns, then keep only id..readscore.
sch %>%
  mutate(
    mathscore = coalesce(mathscore, math4, math5),
    readscore = coalesce(readscore, read4, read5)
  ) %>%
  select(id:readscore)
EDIT: I just tried to do this approach on my actual data and it does not work because the replacement data also has some NAs and, as a result, the dfs I try to do coalesce with have differing number of rows... Back to square one.
I was able to figure this out with the following code (although it's not the most elegant or straightforward), and @Edwin's response helped point me in the right direction. Any suggestions on how to make this code more elegant and efficient are more than welcome!
# Idea: reshape the complete wide tables into the same long layout as
# school_lf, stack them, then fill the NAs from the stacked table.
# school4/school5 name their score columns math4/read4 and math5/read5,
# so those are the columns to drop when separating reading from math.
sch4r <- subset(school4, select = -math4)
sch4m <- subset(school4, select = -read4)
sch5r <- subset(school5, select = -math5)
sch5m <- subset(school5, select = -read5)
# Put these in LF
key_cols <- c("id", "selected", "grade")
sch4r_lf <- melt(sch4r, id.vars = key_cols, value.name = "readscore")
sch4m_lf <- melt(sch4m, id.vars = key_cols, value.name = "mathscore")
sch5r_lf <- melt(sch5r, id.vars = key_cols, value.name = "readscore")
sch5m_lf <- melt(sch5m, id.vars = key_cols, value.name = "mathscore")
# Combine in one DF per grade. Both *_lf tables of a grade are melted from
# the same wide table, so their rows are in the same order.
sch_full_4 <- sch4r_lf
sch_full_4$mathscore <- sch4m_lf$mathscore
sch_full_4$variable <- NULL
sch_full_5 <- sch5r_lf
sch_full_5$mathscore <- sch5m_lf$mathscore
sch_full_5$variable <- NULL
# Stack together
sch_full <- rbind(sch_full_4, sch_full_5)
sch_full$selected <- NULL # delete this column...
# MERGE together
# coalesce() pairs values by position, so refuse to continue unless both
# tables list the same (id, grade) rows in the same order.
stopifnot(
  nrow(school_lf) == nrow(sch_full),
  all(school_lf$id == sch_full$id),
  all(school_lf$grade == sch_full$grade)
)
final_df <- mutate(
  school_lf,
  mathscore = coalesce(mathscore, sch_full$mathscore),
  readscore = coalesce(readscore, sch_full$readscore)
)

What is the most "data.table" way to assign values to a column of a data.table from a different data.table

I was trying to assign the values from column "True_value" in data.table b into the column of the same name in data.table a. I finally did get something to work, but I'm not sure 1)why it worked and 2)whether there's a better way. Any insight would be appreciated. And yes, with respect, I have read up on data.table a little.
require(data.table)
a=as.matrix(c("a","b","c"))
a=cbind(a,c("yes", "no", "maybe"))
a=cbind(a,c("NA","NA","NA"))
rownames(a)=c("one", "two","three")
colnames(a)=c("letter", "status", "True_value")
a=as.data.table(a)
b=as.data.table(c(3,13,42))
colnames(b)=c("True_value")
a[,True_value]
b[,True_value]
##this doesn't work
a[,True_value] = b[,True_value]
##this doesn't assign the values, but rather assigns the string, "True_value"
a[,"True_value"] = b[,"True_value"]
##this doesn't work
a[,.(True_value)] = b[,.(True_value)]
##none of these work
a[,.(True_value)] = unlist(b[,True_value])
a[,True_value] = unlist(b[,True_value])
##and yet this one works. Why does this work, and is there a better way to do this?
a[,"True_value"] = unlist(b[,True_value])
Your example is a really amazing way of creating a data.table :)
# library() stops with an error if data.table is missing; require() would
# only return FALSE and let the script fail later.
library(data.table)
a <- data.table(
  letter = c("a", "b", "c"),
  status = c("yes", "no", "maybe"),
  True_value = NA_real_
)
b <- data.table(True_value = c(3, 13, 42))
# I am not sure if this is the most "data.table" way of doing this,
# but it is readable at least:
# `:=` updates column True_value of `a` by reference with b's values.
a[, True_value := b[, True_value]]
If you want to match any number of columns in any order:
library(data.table)
library(dplyr)
a <- data.table(
  V1 = c("a", "b", "c"),
  V2 = c("yes", "no", "maybe"),
  V3 = NA,
  V4 = NA
)
b <- data.table(
  V4 = c(1, 2, 3),
  ignore = c(99),
  V3 = c(3, 13, 42)
)
# Get positions of matching columns in a,
# thanks to match can be in any order.
# as.integer() strips the "na.action" attribute that na.omit() attaches.
fromA <- match(names(b), names(a)) %>% na.omit() %>% as.integer()
# Get positions of matching columns in b in the original order
fromB <- which(names(b) %in% names(a))
a[, fromA, with = FALSE]
b[, fromB, with = FALSE]
# Wrapping the LHS in parentheses makes `:=` treat fromA as column
# positions held in a variable; this replaces the deprecated
# `with = FALSE` form.
a[, (fromA) := b[, fromB, with = FALSE]]
print(a)

R: Find matching string then copying row

I have a multi-step problem. First step: match text in one string (df1) from one column to another range of columns (df2). There is no order of which columns match and the match could occur anywhere within the range. Once the match is found, copy the df2 row match into df1. Finally, repeat for the entire column.
df1= structure(list(Assay = c("ATG_AR_trans_up","NVS_PXR_cis","BSK_VCAM1_up"), p.value = c(0.01,0.05,0.0001)), .Names = c("Assay", "p.value"),row.names = c(NA, 3L), class = "data.frame")
df1
Assay p.value
ATG_AR_trans_up 0.01
NVS_hPXR 0.065
BSK_VCAM1_up 0.001
df2=structure(list(GeneID = c("AR", "VACM1", "TR", "ER", "PXR"), Assay1= c("ATG_ARE_cis", "BSK_hEDG_VCAM1", "NVS_TR_tran", "ATG_ER_UP", "NVS_PXRE_UP"), Assay2= c("ATG_AR_trans_up", "BSK_BE3K_VCAM1", "NA", "ATG_ERE_cis", "ATG_PXRE_cis"), Assay3= c("NVS_AR_trans", "BSK_VCAM1_UP", "NA", "NVS_ERa_CIS", "NVS_PXR_cis"), Assay4= c("Tox21_AR_ARE","NA", "NA", "Tox21_ERaERb_lig", "NA")), .Names = c("GeneID", "Assay1", "Assay2", "Assay3", "Assay4"),row.names = c(NA, 5L), class = "data.frame")
df2
GeneID Assay1 Assay 2 Assay3
AR ATG_ARE_cis NVS_hAR ATG_AR_trans_up
VACM1 BSK_hEGF_CAM1 BSK_VCAM1_up BSK_VCAM1_down
TR NVS_TR_tran NA NA
ER ATG_ER_UP ATG_ERE_cis NVS_ERa_CIS
PXR ATG_PXR_down ATG_PXRE_cis NVS_hPXR
Essentially becomes
df
Assay p.value GeneID Assay1 Assay2 Assay3
ATG_AR_trans_up 0.01 AR ATG_ARE_cis NVS_hAR ATG_AR_trans_up
NVS_hPXR 0.065 PXR ATG_PXR_down ATG_PXRE_cis NVS_hPXR
BSK_VCAM1_up 0.001 VCAM1 BSK_hEGF_CAM1 BSK_VCAM1_up BSK_VCAM1_down
For brevity I shortened the df substantially, but it is around 88 Assays and some 4,000 rows to go through for just one match (there are about 30). So my initial instinct is to loop, but I was told grep might be a helpful package (even though it is not available for R 3.2.2). Any help would be appreciated though.
Since OP was interested in a grep solution, another way to do it would be,
# One string per row of df2: all of the row's cells joined by spaces.
asDF2 <- apply(df2, 1, function(r) paste(r, collapse = " "))
# For every df1 row, bind it to each df2 row whose joined text matches the
# assay name (case-insensitive regex). Rows without a match yield NULL,
# which rbind() drops. seq_len() keeps this safe when df1 has no rows.
do.call(rbind, lapply(seq_len(nrow(df1)), function(i) {
  matchIX <- grepl(df1$Assay[i], asDF2, ignore.case = TRUE)
  if (any(matchIX)) {
    cbind(df1[i, ], df2[matchIX, ])
  }
}))
The first line creates a character vector in which each element is the space-separated concatenation of one row of df2. The second statement loops over the rows of df1 and finds matches in asDF2 using grepl.
Or equivalently,
# Cells of each df2 row as plain character vectors, computed once.
df2_rows <- lapply(seq_len(nrow(df2)), function(k) {
  vapply(df2[k, ], as.character, character(1))
})
# A df2 row matches when any of its cells matches the assay name
# (case-insensitive regex). Testing the cells directly avoids matching
# against the deparsed text that grepl() would build from a data frame.
do.call(rbind, lapply(seq_len(nrow(df1)), function(i) {
  matchIX <- vapply(
    df2_rows,
    function(cells) any(grepl(df1$Assay[i], cells, ignore.case = TRUE)),
    logical(1)
  )
  if (any(matchIX)) {
    cbind(df1[i, ], df2[matchIX, ])
  }
}))
Note that the above variants can match multiple rows of df2 to a single row of df1.
NOTE
To test I added new rows to original data frames as
df1 <- rbind(df1, data.frame(Assay="NoMatch", p.value=.2))
df2 <- rbind(df2,
data.frame(GeneID="My", Assay1="NVS_PXR_cis", Assay2="NA", Assay3="NA", Assay4="NA"))
This can be easily done with reshaping. I put all the assays into all caps because that was messing up the matching.
library(dplyr)
library(tidyr)
library(stringi)
# Tag every df2 row with an id so the full row can be recovered after the
# assay columns have been reshaped to long form.
df2_ID <- df2 %>% mutate(new_ID = seq_len(n()))
# Both sides of the join are compared as upper-case ASCII.
normalise_assay <- function(x) stri_trans_toupper(iconv(x, to = "ASCII"))
result <-
  df2_ID %>%
  # Select whatever assay columns exist instead of a fixed Assay1:Assay85.
  select(new_ID, starts_with("Assay")) %>%
  gather(assay_number, Assay, -new_ID) %>%
  mutate(Assay = normalise_assay(Assay)) %>%
  inner_join(
    df1 %>% mutate(Assay = normalise_assay(Assay)),
    by = "Assay"
  ) %>%
  inner_join(df2_ID, by = "new_ID")
Since you're new to R, I think you are right that the most intuitive way to do this is with a for-loop. This is not the most concise or most efficient way to do this, but it should be clear what's going on.
# Creating example data
# (matrix() holds a single type, so the p values are stored as text here)
df1 <- as.data.frame(matrix(data = c("aa", "bb", "ee", .9, .5, .7), nrow = 3))
names(df1) <- c("assay", "p")
df2 <- as.data.frame(matrix(data = c("G1", "G2", "aa", "dd", "bb", "ee", "cc", "ff"), nrow = 2))
names(df2) <- c("GeneID", "assay1", "assay2", "assay3")
# Row of df2 that contains each assay: the first hit only, NA when absent.
index <- vapply(
  as.character(df1$assay),
  function(a) which(df2 == a, arr.ind = TRUE)[1],
  integer(1),
  USE.NAMES = FALSE
)
# Pull the matched rows in one step and store every column as character
df3 <- df2[index, , drop = FALSE]
df3[] <- lapply(df3, as.character)
rownames(df3) <- NULL
df <- cbind(df1, df3)
Edit after clarification from user:
I just created a triple for loop to check your values. Basically what it does is it looks for a match. It does this by looping through all columns and all the values from that column.
However my code is not perfect yet (also a beginner in R) and I just wanted to post it so that maybe we can work something out together :).
So I first convert your data to a data.frame. After that I create an empty output which I later fill per match found.
The improvements in this method would be that with this solution the function append will also append the column names which will result in multiple useless column names.
df3 <- as.data.frame(df1)
df4 <- as.data.frame(df2)
# One slot per df3 row; a slot stays NULL when no match is found.
matched <- vector("list", nrow(df3))
for (j in seq_len(nrow(df3))) {
  found <- FALSE
  # Column 1 of df4 is skipped, as in the original 2:ncol(df4) loop
  for (i in seq_len(ncol(df4))[-1]) {
    for (p in seq_len(nrow(df4))) {
      # Compare as text; isTRUE() treats an NA comparison as "no match"
      same <- isTRUE(as.character(df3[j, 1]) == as.character(df4[p, i]))
      if (!found && same) {
        # Keep the df4 row where the match was found (row p, not row j)
        matched[[j]] <- cbind(df3[j, ], df4[p, ])
        found <- TRUE
      }
    }
  }
}
# rbind() keeps one set of column names; append() on a data frame would
# have produced a flat list that repeats the names for every match.
matched <- Filter(Negate(is.null), matched)
output <- if (length(matched) > 0L) do.call(rbind, matched) else data.frame()
Assuming, you don't have any repeated entry corresponding to the entry in df1. Following is the solution for your problem:
assay <- as.matrix(df1[, 1])
# Row of df2 whose column `col` matches each assay (case-insensitive regex),
# or 0 when there is no match. As in the original approach, this assumes
# each assay matches at most one row per column.
find_rows <- function(col) {
  hits <- as.numeric(lapply(assay, function(x) {
    grep(x, df2[, col], ignore.case = TRUE)
  }))
  hits[is.na(hits)] <- 0
  hits
}
m1 <- find_rows(2)
m2 <- find_rows(3)
m3 <- find_rows(4)
m4 <- find_rows(5)
# Summing gives the df2 row only if an assay is found in a single column
m0 <- (m1 + m2 + m3 + m4)
# Bind each df1 row to its df2 row; unmatched rows (index 0) are skipped
# because cbind() cannot combine one row with zero rows.
df <- do.call(rbind, lapply(seq_len(nrow(df1)), function(i) {
  if (m0[i] > 0) {
    cbind(df1[i, ], df2[m0[i], ])
  }
}))
Edit: Generalization
Since you have more than 80 assay columns, you can generalize it as follows:
assay <- as.matrix(df1[, 1])
# Storing Assay column in a list
# Every df2 column except the first is searched. Indexing by position
# also works when df2 has only one assay column.
assay_cols <- seq_len(ncol(df2))[-1]
m <- lapply(assay_cols, function(col) {
  as.numeric(lapply(assay, function(x) {
    grep(x, df2[, col], ignore.case = TRUE)
  }))
})
names(m) <- names(df2)[assay_cols]
# Getting row subscript for df2
m1 <- as.data.frame(m)
m1[is.na(m1)] <- 0
# Summing gives the df2 row only if an assay is found in a single column
m2 <- rowSums(m1)
# Bind each df1 row to its df2 row; unmatched rows (index 0) are skipped
# because cbind() cannot combine one row with zero rows.
df <- do.call(rbind, lapply(seq_len(nrow(df1)), function(i) {
  if (m2[i] > 0) {
    cbind(df1[i, ], df2[m2[i], ])
  }
}))

R: Building a list from matching values in a data.frame

I have a 3 column data frame which looks a little like this:
id name links
1 134235 dave "34657","34563","23459"
2 23459 mary "134235","45868","45677"
3 165432 jane "134235","23459","44657"
where id and name values are unique, and links is a string of ids which indicate an association with some of the names in each row. So for example dave includes the links id 23459 which is mary so dave is connected to mary. What I need to produce is a pair list of all the connections in the data so with the example data I would output something like:
dave,mary
mary,dave
jane,dave
jane,mary
Very new to R and seen amazing things done with methods like apply and before going off and trying to replicate a solution which would look more like a javascript routine and be very inefficient I wondered if anyone could help.
One solution, using Matt's dput():
tab <- structure(list(
  id = c("134235", "23459", "165432"),
  name = c("dave", "mary", "jane"),
  links = c("'34657', '34563', '23459'",
            "'134235', '45868', '45677'",
            "'134235', '23459', '44657'")),
  .Names = c("id", "name", "links"),
  row.names = c(NA, -3L), class = "data.frame")

# Build "name,linked_name" pairs for one person.
#
# name:  the person's name.
# links: their link string, e.g. "'34657', '34563', '23459'".
# Returns one string per linked id that exists in tab$id, or character(0)
# when none of the linked ids is known.
conns <- function(name, links) {
  # Strip quotes, split on commas and trim blanks. The ids stay character:
  # a numeric round-trip would turn "100000" into "1e+05" and miss it.
  link_ids <- trimws(unlist(strsplit(gsub("'|\"", "", links), ",", fixed = TRUE)))
  linked <- tab$name[tab$id %in% link_ids]
  if (length(linked) == 0L) {
    # paste() with an empty vector would otherwise yield "name,"
    return(character(0))
  }
  paste(name, linked, sep = ",")
}

connections <- unname(unlist(mapply(conns, tab$name, tab$links,
                                    SIMPLIFY = FALSE)))
The first step should be to normalize the data, in particular, parse the strings.
You can use ddply: it applies a function
that takes a chunk of a data.frame (a row, in our case)
and transforms it in some way. You just have to write a function
that works on one row, i.e., on one string.
# Sample data
n <- 10
k <- 3
# Random ids; unique() may leave fewer than n of them, so n is recomputed
ids <- as.character(unique(round(1e5 * runif(n))))
n <- length(ids)
names <- LETTERS[seq_len(n)]
# k distinct link targets per id, never the id itself
links <- lapply(ids, function(u) {
  sample(setdiff(ids, u), k, replace = FALSE)
})
# Serialise each set of links as "id1","id2","id3"
links <- vapply(
  links,
  function(u) paste0('"', paste(u, collapse = '","'), '"'),
  character(1)
)
d <- data.frame(
  id = ids,
  name = names,
  links = links,
  stringsAsFactors = FALSE
)
library(plyr)
library(stringr)
# One output row per (id, name, link): drop the quotes, then split the
# link string on commas.
dd <- ddply(
  d,
  c("id", "name"),
  function(u) {
    data.frame(
      id = u$id,
      name = u$name,
      link = unlist(str_split(str_replace_all(u$links, '"', ""), ","))
    )
  }
)
You can then join the data, either with merge or sqldf.
library(sqldf)
# Pair each person with the people they link to. The aliases keep the two
# result columns from both being called "name".
sqldf("
  SELECT A.name AS name, B.name AS linked_name
  FROM dd AS A, d AS B
  WHERE A.link = B.id
")
dat <- structure(list(
  id = c("134235", "23459", "165432"),
  name = c("dave", "mary", "jane"),
  links = c("'34657', '34563', '23459'",
            "'134235', '45868', '45677'",
            "'134235', '23459', '44657'")),
  .Names = c("id", "name", "links"),
  row.names = c(NA, -3L), class = "data.frame")
# It can all be done in base, of course...
library(stringr)
library(reshape2)
# This would be easy to do if links weren't in that format -
# one record per id-link pair would be preferable.
# Split dat$links and remove any quotes
# (arguments are spelled out in full: `replacement`, not `replace`;
# the class ['"] matches only the two quote characters)
dat.wider <- data.frame(
  dat[, c("id", "name")],
  str_split_fixed(
    string = gsub(pattern = "['\"]", replacement = "", x = dat$links),
    pattern = ", ",
    n = 3 # every example row carries exactly three links
  )
)
# Reshape
dat.long <- melt(dat.wider, id.vars = c("id", "name"))
# Join every link value to the id/name lookup. The lookup is `dat`
# itself, which has one row per id, so the join adds no duplicates.
dat.joined <- unique(merge(
  x = dat.long[, c("name", "value")],
  y = dat[, c("id", "name")],
  by.x = "value",
  by.y = "id"
))
# And, finally, if you wanted vector output...
res <- with(dat.joined, paste(name.x, name.y, sep = ", "))

Resources