3D Euclidean distance to identify unknown samples - r

I have a dataframe called mydf with three principal component scores (PCA.1, PCA.2, PCA.3). I want to compute the 3D distance matrix and find the shortest Euclidean distance between all pairs of samples. In another dataframe called myref, some samples have a known identity and some are unknown. Using the shortest Euclidean distance from mydf, I want to assign the known identities to the unknown samples. Can someone please help me get this done?
mydf
mydf <- structure(list(Sample = c("1", "2", "4", "5", "6", "7", "8",
"9", "10", "12"), PCA.1 = c(0.00338, -0.020373, -0.019842, -0.019161,
-0.019594, -0.019728, -0.020356, 0.043339, -0.017559, -0.020657
), PCA.2 = c(0.00047, -0.010116, -0.011532, -0.011582, -0.013245,
-0.011751, -0.010299, -0.005801, -0.01, -0.011334), PCA.3 = c(-0.008787,
0.001412, 0.003751, 0.00371, 0.004242, 0.003738, 0.000592, -0.037229,
0.004307, 0.00339)), .Names = c("Sample", "PCA.1", "PCA.2", "PCA.3"
), row.names = c(NA, 10L), class = "data.frame")
myref
myref<- structure(list(Sample = c("1", "2", "4", "5", "6", "7", "8",
"9", "10", "12"), Identity = c("apple", "unknown", "ball", "unknown",
"unknown", "car", "unknown", "cat", "unknown", "dog")), .Names = c("Sample",
"Identity"), row.names = c(NA, 10L), class = "data.frame")

uIX = which(myref$Identity == "unknown")
dMat = as.matrix(dist(mydf[, -1])) # Euclidean distance matrix over the three PCA columns
nn = apply(dMat, 1, order)[2, ] # For each row, take the index of the second-smallest distance,
# i.e. the nearest neighbour (the smallest value is the sample's zero distance to itself)
myref$Identity[uIX] = myref$Identity[nn[uIX]]
Note that the above code can still assign "unknown", because an unknown sample's nearest neighbour may itself be unknown. If you instead want to match to the nearest neighbour with a known identity, set the unknown-to-unknown distances to Inf right after building dMat (and before computing nn):
dMat[uIX, uIX] = Inf
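Putting the pieces together, a minimal sketch of that variant (it assumes mydf and myref list the samples in the same order, as in the question; setting the diagonal to Inf and using which.min avoids picking the second-smallest entry now that the self-distance is no longer guaranteed to be the minimum):
uIX  <- which(myref$Identity == "unknown")
dMat <- as.matrix(dist(mydf[, -1]))   # pairwise 3D Euclidean distances
dMat[uIX, uIX] <- Inf                 # an unknown can never match another unknown...
diag(dMat) <- Inf                     # ...and never matches itself
nn   <- apply(dMat, 1, which.min)     # nearest remaining sample for every row
myref$Identity[uIX] <- myref$Identity[nn[uIX]]
myref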

Related

Loop for creating multiple new 3 level variables from another 5 level variable

I'm looking for a way to generate multiple 3-level variables from an older 5-level variable, while keeping the old variables.
This is how it is now:
structure(list(Quesiton1 = c("I", "5", "4", "4"), Question2 = c("I",
"5", "4", "4"), Question3 = c("I", "3", "2", "4")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -4L))
I would like this:
structure(list(Quesiton1 = c("I", "5", "4", "4"), Question2 = c("I",
"5", "4", "4"), Question3 = c("I", "3", "2", "4"), Question1_3l = c("NA",
"3", "3", "3"), Question2_3l = c("NA", "3", "3", "3"), Question3_3l = c("NA",
"2", "1", "3")), row.names = c(NA, -4L), class = c("tbl_df",
"tbl", "data.frame"))
I have this code to recode the 5-level variable
df2 %>%
  mutate_at(vars(Question1, Question2, Question3),
            recode, '1' = 1, '2' = 1, '3' = 3, '4' = 5, '5' = 5, 'l' = NA)
But what I want to do is to keep the old variable and generate the 3 level variable into something like Question1_3l, Question2_3l, Question3_3l.
It shouldn't be too difficult. In Stata it looks something like this:
foreach i of varlist ovsat-not_type_number {
    local lbl : variable label `i'
    recode `i' (1/2=1)(3=2)(4/5=3), gen(`i'_3l)
}
Thank you.
Not the most elegant, not the fastest (but still pretty fast), not the most idiomatic, but this does what you want (I think) and should be easy to read and customize.
library(data.table)

dt <- structure(list(Quesiton1 = c("I", "5", "4", "4"),
                     Question2 = c("I", "5", "4", "4"),
                     Question3 = c("I", "3", "2", "4")),
                class = c("tbl_df", "tbl", "data.frame"),
                row.names = c(NA, -4L))
# transform your data into a data.table
setDT(dt)
# define the names of the columns that you want to recode
vartoconv <- names(dt)
# define the names of the recoded columns
newnames <- paste0(vartoconv, "_3l")
# loop along the vector of names of the columns to recode
for (varname_loopid in seq_along(vartoconv)) {
  # identify the name of the column to recode for this iteration
  varname_loop <- vartoconv[varname_loopid]
  # identify the name of the recoded column for this iteration
  newname_loop <- newnames[varname_loopid]
  # create the recoded variable by using conditionals on the variable to recode
  dt[get(varname_loop) %in% c(1, 2), (newname_loop) := 1]
  dt[get(varname_loop) == 3, (newname_loop) := 2]
  dt[get(varname_loop) %in% c(4, 5), (newname_loop) := 3]
}
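As a more compact variation on the same idea (my suggestion, not part of the original answer), data.table's fcase() can recode all the columns in one step; it needs data.table >= 1.12.8:
dt[, (newnames) := lapply(.SD, function(x)
       fcase(x %in% c("1", "2"), 1L,
             x == "3",           2L,
             x %in% c("4", "5"), 3L)),
   .SDcols = vartoconv]
# values that match no condition (here the "I" entries) are left as NA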
Try:
library(tidyverse)
library(stringr)
df2 <- replicate(6, sample(as.character(1:5), 50, replace = TRUE), simplify = "matrix") %>%
  as_tibble(.name_repair = ~ str_c("Question", 1:6))
df2 %>%
  mutate_at(vars(Question1:Question3),
            ~ case_when(.x %in% c('1', '2') ~ 1L, # 1L means integer 1
                        .x %in% c('3') ~ 3L,
                        .x %in% c('4', '5') ~ 5L,
                        TRUE ~ as.integer(NA)))
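This overwrites Question1:Question3 in place. To keep the originals and write the recoded values into new Question*_3l columns (what Stata's gen() option does), a sketch is to give mutate_at a named list of functions, so dplyr appends the list name as a suffix; here using the 1/2 -> 1, 3 -> 2, 4/5 -> 3 mapping from the desired output:
df2 %>%
  mutate_at(vars(Question1:Question3),
            list(`3l` = ~ case_when(.x %in% c('1', '2') ~ 1L,
                                    .x == '3'           ~ 2L,
                                    .x %in% c('4', '5') ~ 3L,
                                    TRUE                ~ NA_integer_)))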

Row-wise parallel processing in R?

I am working on large data sets, for which I have written code that performs a row-by-row operation on a data frame sequentially. The process is slow.
I am trying to perform the operation with parallel processing to make it faster.
Here is the code:
library(geometry)
# Data set - a
data_a = structure(c(10.4515034409741, 15.6780890052356, 12.5581992918563,
9.19067944250871, 14.4459166666667, 11.414, 17.65325, 12.468,
11.273, 15.5945), .Dim = c(5L, 2L), .Dimnames = list(c("1", "2",
"3", "4", "5"), c("a", "b")))
# Data set - b
data_b = structure(c(10.4515034409741, 15.6780890052356, 12.5581992918563,
9.19067944250871, 14.4459166666667, 11.3318076923077, 13.132273830156,
6.16003995082975, 11.59114820435, 10.9573192090395, 11.414, 17.65325,
12.468, 11.273, 15.5945, 11.5245, 12.0249, 6.3186, 13.744, 11.0921), .Dim = c(10L,
2L), .Dimnames = list(c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10"), c("a",
"b")))
conv_hull_1 <- convhulln(data_a, options = "FA") # compute the convex hull of data_a
test = c()
for (i in 1:nrow(data_b)) {
  df = c()
  con_hull_all <- inhulln(conv_hull_1, matrix(data_b[i, ], ncol = 2))
  df$flag <- ifelse(con_hull_all[1] == TRUE, 0, ifelse(con_hull_all[1] == FALSE, 1, 2))
  test <- as.data.frame(rbind(test, df))
  print(i)
}
test
Is there any way to parallelize the row-wise computation?
As you can observe, for small datasets the computation time is really low, but as soon as I increase the data size, it increases drastically.
Can you provide a solution with the code?
Thanks in advance.
You could take advantage of the fact that inhulln accepts a whole matrix of test points, so more than one row can be tested in a single call.
I've tried the code below on a 320,000 row matrix that I made from the original data and it's quick.
library(geometry)
library(dplyr)
# Data set - a
data_a = structure(c(10.4515034409741, 15.6780890052356, 12.5581992918563,
                     9.19067944250871, 14.4459166666667, 11.414, 17.65325,
                     12.468, 11.273, 15.5945),
                   .Dim = c(5L, 2L),
                   .Dimnames = list(c("1", "2", "3", "4", "5"), c("a", "b")))
# Data set - b
data_b = structure(c(10.4515034409741, 15.6780890052356, 12.5581992918563,
                     9.19067944250871, 14.4459166666667, 11.3318076923077,
                     13.132273830156, 6.16003995082975, 11.59114820435,
                     10.9573192090395, 11.414, 17.65325, 12.468, 11.273,
                     15.5945, 11.5245, 12.0249, 6.3186, 13.744, 11.0921),
                   .Dim = c(10L, 2L),
                   .Dimnames = list(c("1", "2", "3", "4", "5", "6", "7", "8",
                                      "9", "10"), c("a", "b")))
conv_hull_1 <- convhulln( data_a, options = "FA") # Draw Convex Hull
#Make a big data_b
for (i in 1:15) {
data_b = rbind(data_b, data_b)
}
In_Or_Out <- inhulln(conv_hull_1, data_b)
result <- data.frame(data_b) %>% bind_cols(InOrOut=In_Or_Out)
I use dplyr::bind_cols to bind the in or out result to a data frame version of the original data so you might need some changes for your specific environment.
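If the per-row work really cannot be batched the way inhulln allows, the question can also be answered literally with the base parallel package. A minimal sketch (the cluster size and the 0/1 flag convention are carried over from the code above as assumptions; for this particular task the single vectorised inhulln call will almost always be faster):
library(parallel)
library(geometry)
cl <- makeCluster(max(1, detectCores() - 1))
clusterEvalQ(cl, library(geometry))
clusterExport(cl, c("conv_hull_1", "data_b"))
flags <- parSapply(cl, seq_len(nrow(data_b)), function(i) {
  inside <- inhulln(conv_hull_1, matrix(data_b[i, ], ncol = 2))
  if (inside) 0 else 1   # 0 = inside the hull, 1 = outside, as in the question's loop
})
stopCluster(cl)
test <- data.frame(flag = flags)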

Convert column types to their read_csv() column type in R

One of my favorite things about library(readr) and the read_csv() function in R is that it almost always sets the column types of my data to the correct class. However, I am currently working with an API in R that returns data to me as a dataframe of all character classes, even if the data is clearly numbers. Take this dataframe for example, which has some sports data:
dput(mydf)
structure(list(isUnplayed = c("false", "false", "false"), isInProgress =
c("false", "false", "false"), isCompleted = c("true", "true", "true"), awayScore = c("106",
"95", "95"), homeScore = c("94", "97", "111"), game.ID = c("31176",
"31177", "31178"), game.date = c("2015-10-27", "2015-10-27",
"2015-10-27"), game.time = c("8:00PM", "8:00PM", "10:30PM"),
game.location = c("Philips Arena", "United Center", "Oracle Arena"
), game.awayTeam.ID = c("88", "86", "110"), game.awayTeam.City = c("Detroit",
"Cleveland", "New Orleans"), game.awayTeam.Name = c("Pistons",
"Cavaliers", "Pelicans"), game.awayTeam.Abbreviation = c("DET",
"CLE", "NOP"), game.homeTeam.ID = c("91", "89", "101"), game.homeTeam.City = c("Atlanta",
"Chicago", "Golden State"), game.homeTeam.Name = c("Hawks",
"Bulls", "Warriors"), game.homeTeam.Abbreviation = c("ATL",
"CHI", "GSW"), quarterSummary.quarter = list(structure(list(
`#number` = c("1", "2", "3", "4"), awayScore = c("25",
"23", "34", "24"), homeScore = c("25", "18", "23", "28"
)), .Names = c("#number", "awayScore", "homeScore"), class = "data.frame", row.names = c(NA,
4L)), structure(list(`#number` = c("1", "2", "3", "4"), awayScore = c("17",
"23", "28", "27"), homeScore = c("26", "20", "25", "26")), .Names = c("#number",
"awayScore", "homeScore"), class = "data.frame", row.names = c(NA,
4L)), structure(list(`#number` = c("1", "2", "3", "4"), awayScore = c("35",
"14", "26", "20"), homeScore = c("39", "20", "35", "17")), .Names = c("#number",
"awayScore", "homeScore"), class = "data.frame", row.names = c(NA,
4L)))), .Names = c("isUnplayed", "isInProgress", "isCompleted",
"awayScore", "homeScore", "game.ID", "game.date", "game.time",
"game.location", "game.awayTeam.ID", "game.awayTeam.City", "game.awayTeam.Name",
"game.awayTeam.Abbreviation", "game.homeTeam.ID", "game.homeTeam.City",
"game.homeTeam.Name", "game.homeTeam.Abbreviation", "quarterSummary.quarter"
), class = "data.frame", row.names = c(NA, 3L))
It is quite a hassle to deal with this dataframe once it is returned by the API, given the column classes. I've come up with a sort of hack to update them, which is as follows:
write_csv(mydf, 'mydf.csv')
mydf <- read_csv('mydf.csv')
By writing to CSV and then re-reading the CSV using read_csv(), the dataframe columns update. Unfortunately I am left with a CSV file in my directory that I don't want. Is there a way to update the columns of an R dataframe to their 'read_csv()' column classes, without actually having to write the CSV?
Any help is appreciated!
You don't need to write and read the data if you just want readr to guess your column types. You can use readr::type_convert for that:
iris %>%
  dplyr::mutate(Sepal.Width = as.character(Sepal.Width)) %>%
  readr::type_convert() %>%
  str()
For comparison:
iris %>%
  dplyr::mutate(Sepal.Width = as.character(Sepal.Width)) %>%
  str()
Try this code; type.convert converts a character vector to logical, integer, numeric, complex or factor as appropriate:
# convert every character column to its natural type
indx <- which(sapply(df, is.character))
df[, indx] <- lapply(df[, indx], type.convert)
# with the default as.is, type.convert may return factors for non-numeric strings, so map those back to character
indx <- which(sapply(df, is.factor))
df[, indx] <- lapply(df[, indx], as.character)
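Applied to the question's mydf, the same idea looks like this (a sketch; as.is = TRUE keeps converted strings as character rather than factor, and the quarterSummary.quarter list-column is left untouched because it is not a character column):
chr_cols <- which(sapply(mydf, is.character))
mydf[chr_cols] <- lapply(mydf[chr_cols], type.convert, as.is = TRUE)
str(mydf)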

Iterating over data.table in lapply or for loop

I am basically wondering why it makes a difference whether you iterate over a vector created from a data.table or over a basic vector, and why that is the case.
I think it's quite similar to this question, but I can narrow it down to a more basic example.
library(data.table)

dt <- setDT(structure(list(File.Name = c("file1.xlsx", "file2.xlsx", "file3.xlsx"),
                           Split.ID = structure(c(1L, 1L, 1L),
                                                .Label = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
                                                           "11", "12", "13", "14", "15", "16", "17", "18", "19", "20"),
                                                class = "factor")),
                      .Names = c("File.Name", "Split.ID"),
                      class = c("data.table", "data.frame"), row.names = c(NA, -3L)))
for (file in as.vector(dt[1:3, 1])) {
  print(file)
}
for (i in letters[1:4]) { print(i) }
As can be seen from the output, the first for-loop prints all the values in a single iteration, while the second loop iterates over the individual letters.
I am trying to use the file names with readxl, so vectorizing the function is not really an option, since I want to process them sequentially. Additionally, I would like to keep the list of files as a data.table so I can split it into different parts.
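A minimal sketch of what is going on: dt[1:3, 1] is itself a one-column data.table, and as.vector() applied to a data.table (or data.frame) returns a list of its columns, so the first loop has only a single element, the whole character vector, to iterate over. Extracting the column as an atomic vector restores one-file-per-iteration behaviour while dt itself stays a data.table:
for (file in dt[1:3][["File.Name"]]) {
  print(file)   # one file name per iteration
}
# equivalent extractions: dt$File.Name, dt[[1]], or unlist(dt[1:3, 1])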

time series barplot in R

I have time series data like this:
x <- structure(list(date = structure(c(1264572000, 1266202800, 1277362800,
1277456400, 1277859600, 1278032400, 1260370800, 1260892800, 1262624400,
1262707200), class = c("POSIXt", "POSIXct"), tzone = ""), data = c(-0.00183760994446658,
0.00089738603087497, 0.000423513598318936, 0, -0.00216496690393131,
-0.00434836817931339, -0.0224199153445617, 0.000583823085470003,
0.000353088613905206, 0.000470295331234771)), .Names = c("date",
"data"), row.names = c("1", "2", "3",
"4", "5", "6", "7", "8", "9", "10"
), class = "data.frame")
and I want to make a barplot of this dataset where each bar stands for one date (if there are no data for a timespan, there should be gaps).
Can anyone help me?
Using ggplot (note that you have to provide stat="identity" to geom_bar to prevent it from summarising the data and creating a histogram):
library(ggplot2)
ggplot(x, aes(x=date, y=data)) + geom_bar(stat="identity")
And if you are inclined to use base graphics:
plot(x$date, x$data, type="h")
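Because date is a POSIXct column, ggplot2 treats the x-axis as continuous time, so dates with no observations are left as gaps automatically. If you want more readable axis labels, a small sketch (the scales package and the label format are my additions, not part of the question):
library(ggplot2)
library(scales)
ggplot(x, aes(x = date, y = data)) +
  geom_bar(stat = "identity") +
  scale_x_datetime(labels = date_format("%Y-%m-%d"), date_breaks = "1 month") +
  labs(x = "Date", y = "Value")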
