Related
I am trying to split my data frame into 4 smaller data frames according to the vaccine used and the diagnosis.
Here is the loop I've been trying to use:
# Split the prepared data into one data frame per vaccine / diagnosis pair,
# keeping only the rows for the gene of interest. Each subset is stored in the
# calling environment as IgH_CDR3_COI_<vaccine>_<diagnosis>.
gene_of_interest <- "1"
vaccines <- c("A", "B")
diagnosis <- c("Sick", "Healthy")
for (v in vaccines) {
  for (d in diagnosis) {
    # Filter into a NEW object. Assigning the subset back to
    # CDR3_post_challenge_plot_prep would shrink the source data to the first
    # combination, leaving every later combination with zero rows.
    keep <- CDR3_post_challenge_plot_prep$Vaccine == v &
      CDR3_post_challenge_plot_prep$Diagnosis == d &
      CDR3_post_challenge_plot_prep$gene == gene_of_interest
    subset_vd <- CDR3_post_challenge_plot_prep[keep, ]
    assign(paste0("IgH_CDR3_COI_", v, "_", d), subset_vd)
  }
}
The only data frame with any observations outputted from this loop is the one that satisfies the first conditions, that is, "A_Sick". But I know there should be observations in the other 2 data frames.
Here is some of what the data frame looks like:
# dput() excerpt of the input: a 6-row tibble with columns gene, abundance,
# Timepoint2, Vaccine and Diagnosis (Vaccine and Diagnosis are factors).
structure(list(gene = c("1", "1", "2", "3",
"1", "1"), abundance = c(27L, 15L, 33L, 20L, 20L,
69L), Timepoint2 = c("D0.12h", "D0.12h", "D0.12h", "D0.12h",
"D0.12h", "D0.12h"), Vaccine = structure(c(2L, 3L, 3L, 2L, 3L,
2L), .Label = c("Control", "B", "A"), class = "factor"),
Diagnosis = structure(c(2L, 1L, 2L, 2L, 1L, 1L), .Label = c("Healthy",
"Sick", "UNKNOWN - Not Challenged", "UNKNOWN - Treated prior to meeting diagnostic criteria"
), class = "factor")), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
The calculation inside my for loop and if/else works when I have 100-200 rows, but it does not work when I have 20000 rows.
Can someone tell me whether something is wrong with the for loop and if/else, or whether RStudio hits some kind of timeout when running them?
Code:
# Safety-stock (SS) computation per item.
#
# For every row of an item:
#   * agg <= 0           -> SSNew is the row's current SS
#   * agg > 0, Freq == 1 -> SSNew is 0 and the SS of every location this row's
#                           LC routes to (per lctolc) is set to 0
#   * agg > 0, otherwise -> SSNew is the row's current SS
# Rows are processed in order, so a row reads SS values already zeroed by an
# earlier row. Final_v1 holds the processed rows of ALL items, with SSNew as
# the added 7th column.

# Input: one row per Item / location (LC) / fiscal week.
Final <- structure(list(Item = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "0S1576", class = "factor"),
    LC = structure(1:6, .Label = c("MW92", "OY01", "RM11", "RS11",
    "WK14", "WK15"), class = "factor"), Fiscal.Week = structure(c(1L,
    1L, 1L, 1L, 1L, 1L), .Label = "2019-W24", class = "factor"),
    SS = c(15L, 7L, 5L, 9L, 2L, 2L), Freq = c(3, 6, 1, 2, 1,
    1), agg = c(1, 1, 1, 1, 0, 0)), row.names = c(NA, -6L), class = "data.frame")

# Routing table: each row pairs a source location (LC) with a destination
# location (ToLC) for an Item.
lctolc <- structure(list(Item = structure(c(1L, 1L, 1L, 1L, 1L), .Label = "0S1576", class = "factor"),
    LC = structure(c(1L, 2L, 2L, 3L, 3L), .Label = c("MW92",
    "OY01", "RM11"), class = "factor"), ToLC = structure(1:5, .Label = c("OY01",
    "RM11", "RS11", "WK14", "WK15"), class = "factor")), row.names = c(NA,
    -5L), class = "data.frame")

# One row per unique item (kept under its original name).
df <- as.data.frame(unique(Final$Item))

# Compare keys as character. Final$LC and lctolc$LC are factors with different
# level sets, and `==` between two such factors stops with
# "level sets of factors are different".
route_item <- as.character(lctolc$Item)
route_from <- as.character(lctolc$LC)
route_to <- as.character(lctolc$ToLC)

items <- as.character(df[[1]])
per_item <- vector("list", length(items))

for (j in seq_along(items)) {
  item_rows <- Final[as.character(Final$Item) == items[j], ]
  item_rows$SSNew <- rep(NA_real_, nrow(item_rows))
  row_item <- as.character(item_rows$Item)
  row_lc <- as.character(item_rows$LC)

  for (i in seq_len(nrow(item_rows))) {
    if (item_rows$agg[i] <= 0) {
      item_rows$SSNew[i] <- item_rows$SS[i]
    } else {
      item_rows$SSNew[i] <- if (item_rows$Freq[i] == 1) 0 else item_rows$SS[i]

      # Leftover distribution: a zero SSNew zeroes the SS of the destinations.
      # Assign 0 itself; assigning the comparison `SS == 0` would write 1 into
      # rows whose SS was already 0.
      if (item_rows$SSNew[i] == 0) {
        targets <- route_to[route_item == row_item[i] & route_from == row_lc[i]]
        item_rows$SS[row_lc %in% targets] <- 0L
      }
    }
  }

  # Collect each item's result instead of overwriting the previous one.
  per_item[[j]] <- item_rows
}

Final_v1 <- if (length(per_item) > 0L) {
  do.call(rbind, per_item)
} else {
  cbind(Final, SSNew = numeric(0))
}
Can someone explain why this does not work for 20000 rows?
I have a lookup table in R that I am trying to figure out how to implement. The challenge for me is that it involves continuous values or ranges of data. If the value falls inbetween I'd like it to pick the right value.
I want to use the two continuous 'GRADE', 'SAT' variables plus the categorical 'TYPE' value to assign a 'GROUP' value. This big block of code looks intimidating but these are tiny tiny tables.
Any advice is appreciated!!!!
# Lookup table (dput output to recreate the data frame): each of the 4 rows
# gives a Type, a grade range (min_grade..max_grade), a SAT range
# (min_sat..max_sat) and the Group assigned to that combination.
structure(list(Type = structure(c(1L, 2L, 1L, 1L), .Label = c("A",
"B"), class = "factor"), min_grade = c(93L, 85L, 93L, 80L), max_grade = c(100L,
93L, 100L, 92L), min_sat = c(600L, 700L, 400L, 600L), max_sat = c(800L,
800L, 599L, 800L), Group = structure(c(1L, 1L, 2L, 3L), .Label = c("A",
"B", "C"), class = "factor")), .Names = c("Type", "min_grade",
"max_grade", "min_sat", "max_sat", "Group"), class = "data.frame", row.names = c(NA,
-4L))
# Example records (dput output): Name, Grade, Sat and Type are the inputs.
# The Group column holds the DESIRED result, so it would be empty before the
# lookup table is applied.
structure(list(Name = structure(c(3L, 1L, 2L, 4L), .Label = c("Jack",
"James", "John", "Jordan"), class = "factor"), Grade = c(95L,
95L, 92L, 93L), Sat = c(701L, 500L, 800L, 800L), Type = structure(c(1L,
1L, 1L, 2L), .Label = c("A", "B"), class = "factor"), Group = structure(c(1L,
2L, 3L, 1L), .Label = c("A", "B", "C"), class = "factor")), .Names = c("Name",
"Grade", "Sat", "Type", "Group"), class = "data.frame", row.names = c(NA,
-4L))
How about this?
# Lookup table: one row per Type with its grade range, SAT range and the
# Group that combination maps to.
ltab <- data.frame(
  Type = factor(c("A", "B", "A", "A"), levels = c("A", "B")),
  min_grade = c(93L, 85L, 93L, 80L),
  max_grade = c(100L, 93L, 100L, 92L),
  min_sat = c(600L, 700L, 400L, 600L),
  max_sat = c(800L, 800L, 599L, 800L),
  Group = factor(c("A", "A", "B", "C"), levels = c("A", "B", "C"))
)

# Records to classify: no Group column yet.
dat <- data.frame(
  Name = factor(
    c("John", "Jack", "James", "Jordan"),
    levels = c("Jack", "James", "John", "Jordan")
  ),
  Grade = c(95L, 95L, 92L, 93L),
  Sat = c(701L, 500L, 800L, 800L),
  Type = factor(c("A", "A", "A", "B"), levels = c("A", "B"))
)
library(plyr)
# Pair every record in `dat` with every lookup row of the same Type, then flag
# the pairs whose Grade and Sat fall inside the lookup row's ranges.
#
# Bounds are inclusive at both ends: the lookup ranges are adjacent integer
# intervals (e.g. max_sat 599 is followed by min_sat 600), so an exclusive
# lower bound would leave a value equal to a minimum without any match.
# The comparison is vectorised over the merged columns, so no row-wise apply
# is needed.
mdat <- merge(dat, ltab, by = "Type", all = TRUE)
mdat$FallsIn <- mdat$Grade >= mdat$min_grade & mdat$Grade <= mdat$max_grade &
  mdat$Sat >= mdat$min_sat & mdat$Sat <= mdat$max_sat
# which() drops the NA flags of rows present on only one side of the outer
# join; indexing with NA would return all-NA rows instead.
mdat[which(mdat$FallsIn), ]
Thinking about generalizing: are there going to be more continuous variables that you need to check?
EDIT: I could not edit the OP's post, so, taking the OP's comment into account, here is how I would tackle an example of "categorizing multidimensional continuous random variables"
(phrased this way so that these keywords will show up in future searches).
# Categorise multidimensional continuous variables: bin every variable by its
# own break points, then look the combination of bin indices up in a table
# that maps each combination to a group.

# Break points per variable; k break points define k - 1 bins.
breaks <- list(
  Var1 = c(0, 0.25, 1),
  Var2 = c(0, 0.5, 1),
  Var3 = c(0, 0.25, 0.75, 1)
)

# Build the interval labels "(lo, hi]" for consecutive break points.
#   x: vector of break points.
# Returns a character vector with one label per bin, named "1", "2", ...;
# fewer than two break points give an empty vector.
genIntv <- function(x) {
  n_bins <- length(x) - 1L
  if (n_bins < 1L) {
    # 1:(length(x) - 1) would count DOWN here (1, 0) and fabricate labels.
    return(setNames(character(0), character(0)))
  }
  idx <- seq_len(n_bins)
  ret <- paste0("(", x[idx], ", ", x[idx + 1L], "]")
  names(ret) <- idx
  ret
}

# Human-readable lookup: every combination of interval labels and its group.
# Group labels follow the number of combinations instead of a fixed 1:12.
intervalGrid <- expand.grid(lapply(breaks, genIntv), stringsAsFactors = FALSE)
lookupTbl <- data.frame(intervalGrid,
                        Group = LETTERS[seq_len(nrow(intervalGrid))])

# Same lookup keyed by bin index, which is what .bincode() returns.
binGrid <- expand.grid(
  lapply(breaks, function(x) seq_len(length(x) - 1L)),
  stringsAsFactors = FALSE
)
lookupTbl2 <- data.frame(binGrid, Group = LETTERS[seq_len(nrow(binGrid))])

#data set
dat <- data.frame(Var1 = c(0.1, 0.76), Var2 = c(0.5, 0.75), Var3 = c(0.25, 0.9))

# Bin index of every value. Break points are matched to columns by NAME, so
# the column order of `dat` need not follow the order of `breaks`.
binDat <- do.call(cbind, setNames(
  lapply(colnames(dat), function(nm) {
    .bincode(dat[[nm]], breaks[[nm]], right = TRUE, include.lowest = TRUE)
  }),
  colnames(dat)
))

# Attach the group to every binned record (printed as in the original).
binnedGroups <- merge(binDat, lookupTbl2, all.x = TRUE, all.y = FALSE)
binnedGroups
It would be good to learn whether someone else has a better approach.
If you have small data, a full join should be fine.
library(dplyr)
# Join the example records to the lookup table, then keep the pairs whose
# Grade and Sat fall inside the lookup row's ranges.
#
# Bounds are inclusive at both ends: the lookup ranges shown in the question
# are adjacent integer intervals (max_sat 599 followed by min_sat 600), so an
# exclusive lower bound would drop values equal to a minimum.
#
# NOTE(review): full_join() is called without `by` after Type is dropped, so
# it presumably joins on whatever column names the two tables still share.
# Confirm that is intended; matching on Type would need Type kept and
# `by = "Type"`.
result <-
  example %>%
  select(-Type) %>%
  full_join(look_up) %>%
  filter(min_grade <= Grade & Grade <= max_grade &
           min_sat <= Sat & Sat <= max_sat)
I have a dataset Data like below:
# Print a reproducible definition of the input. The structure() call below is
# that output: 9 rows with three factor columns, FN, Values and IND.
dput(Data)
structure(list(FN = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = "20131202-0985 ", class = "factor"), Values = structure(c(1L,
8L, 7L, 6L, 5L, 9L, 2L, 4L, 3L), .Label = c("|639778|21|NANYANG CIRCLE|103.686721631628|1.34640300329567",
"|8121|B01|SOMERSET STN", "|96942883", "|SN30|SMRT\n", "CENTRAL",
"FOUR SEASONS HOTEL", "HOTEL", "IKEA", "nanyang avenue"), class = "factor"),
IND = structure(c(4L, 1L, 1L, 1L, 1L, 6L, 3L, 2L, 5L), .Label = c("BN",
"BR", "BS", "LOC", "PN", "RN"), class = "factor")), .Names = c("FN",
"Values", "IND"), class = "data.frame", row.names = c(NA, -9L
))
I want the above dataset to be converted into a data frame (out_data) in the format below.
At present my Data has 3 columns, and I need to convert these into 16 columns in the format below.
I need to reshape my input into a data frame laid out exactly as in the screenshot.
I cannot change the structure below:
# Column names of the 16-column target layout. c() is needed to build the
# character vector (a bare parenthesised list is a syntax error), and "BO" and
# "PN" are two separate columns.
colnames(out_data) <- c("FN", "H_BLK", "S_N/R_N", "B_N", "FL_N", "U_N", "PC",
                        "XC", "YC", "BS", "BRF", "LCT_DEC", "BRN", "BO", "PN",
                        "S_TY_CD")
The multiple-value entries in the input are always in the format below:
|639778|21|NANYANG CIRCLE|103.686721631628|1.34640300329567 -
|PC|H_BLK|S_N/R_N|XC|YC
|8121|B01|SOMERSET STN -> |BS|BRF|LCT_DEC
|SN30|SMRT ------> |BRN|BO
If the
IND = LOC - then `|PC|H_BLK|S_N/R_N|XC|YC` should be updated with S_TY_CD=LOC
IND= BN - then B_N column should be updated with S_TY_CD=BN
IND = RN - then the S_N/R_N column should be updated with S_TY_CD=RN
IND= BS then `|BS|BRF|LCT_DEC` should be updated with S_TY_CD=BS
IND= BR then `|BRN|BO` should be updated with S_TY_CD=BR
IND= PN then PN with S_TY_CD=PN
Is there an efficient way of doing this.
Here's one method of transformation. First I define some helper functions for the various sub problems.
# Column order of the output table.
outcols <- c("FN", "H_BLK", "S_N/R_N", "B_N", "FL_N", "U_N", "PC",
             "XC", "YC", "BS", "BRF", "LCT_DEC", "BRN", "BO", "PN", "S_TY_CD")

# Name the parts of one split compound value according to its indicator.
#   ind:  a single indicator string (LOC, BN, RN, BS, BR or PN).
#   vals: the parts of the value, one per output column that `ind` maps to.
# Returns `vals` with the output column names attached.
# Stops if `ind` is unknown, or if the number of parts does not match the
# number of columns expected for `ind`.
namevals <- function(ind, vals) {
  if (length(ind) != 1L || is.na(ind)) {
    stop("`ind` must be a single non-missing string", call. = FALSE)
  }
  part_names <- switch(
    ind,
    LOC = c("PC", "H_BLK", "S_N/R_N", "XC", "YC"),
    BN = "B_N",
    RN = "S_N/R_N",
    BS = c("BS", "BRF", "LCT_DEC"),
    BR = c("BRN", "BO"),
    PN = "PN",
    # Fail with the offending value rather than a NULL-length mismatch.
    stop("unknown IND value: ", ind, call. = FALSE)
  )
  if (length(part_names) != length(vals)) {
    stop("IND '", ind, "' expects ", length(part_names), " value part(s) but got ",
         length(vals), call. = FALSE)
  }
  stopifnot(all(part_names %in% outcols))
  names(vals) <- part_names
  vals
}

# Spread named values over a full output row.
#   nvals: named vector whose names are all members of `outcols`.
# Returns an unnamed vector of length(outcols), with NA in every column that
# `nvals` does not supply.
fillrow <- function(nvals) {
  full_row <- rep(NA, length(outcols))
  full_row[match(names(nvals), outcols)] <- nvals
  full_row
}
Now I apply these to each row of the data with mapply to return a character vector. Here we make sure to split the "values" column on the pipe and remove the leading pipe.
# Build one full output row per input row and combine them into a character
# matrix with one COLUMN per input row (rows follow `outcols`).
dt <- mapply(
  function(fn, vals, ind) {
    fillrow(c(FN = fn, namevals(ind, vals), "S_TY_CD" = ind))
  },
  as.character(Data$FN),
  # Drop the leading pipe, split on the remaining pipes, and trim each part so
  # stray whitespace such as a trailing newline ("SMRT\n") does not end up in
  # the output cells.
  lapply(
    strsplit(sub("^\\|", "", as.character(Data$Values)), "|", fixed = TRUE),
    trimws
  ),
  as.character(Data$IND)
)
Finally we tidy the data up so it can be written out to a file with write.table. Note that all missing values are true R NA values. In the write.table, you can set na = "" if you'd rather they print out as blank values than the default "NA" value.
# Turn the matrix into a data frame with the proper column names.
# `dt` holds one column per record, so it is transposed first; unname() drops
# the dimnames so the row names become plain 1..n.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned.
dd <- data.frame(unname(t(dt)), stringsAsFactors = FALSE)
names(dd) <- outcols
dd
(still) new to r, and very confused as to how I should accomplish multiple melts of my data. Here is a subset:
# Sample of the reading-time data: one row per Subject x Item. The *.RT
# columns hold the reading time of a region; the column of the same name
# without ".RT" holds the text shown in that region.
df <- data.frame(
  Subject = c(101L, 101L, 101L, 102L, 102L, 102L),
  Condition = factor(
    c("apass", "apass", "apass", "vpas", "vpas", "vpas"),
    levels = c("apass", "vpas")
  ),
  FreqCode = factor(
    c("LessVerbal", "LessVerbal", "LessVerbal",
      "MoreVerbal", "MoreVerbal", "MoreVerbal"),
    levels = c("LessVerbal", "MoreVerbal")
  ),
  Item = c(1L, 4L, 7L, 1L, 4L, 7L),
  Len = c(80L, 68L, 85L, 68L, 85L, 79L),
  R1_1.RT = c(237L, 203L, 207L, 336L, 487L, 340L),
  R1_2.RT = c(177L, 225L, 162L, 634L, 590L, 347L),
  R1_3.RT = c(200L, 226L, 212L, 707L, 653L, 379L),
  R1.RT = c(614L, 654L, 581L, 1677L, 1730L, 1066L),
  R1_1 = factor(c("The", "The", "The", "The", "The", "The"), levels = "The"),
  R1_2 = factor(
    c("new", "antique", "course", "antique", "course", "road"),
    levels = c("antique", "course", "new", "road")
  ),
  R1_3 = factor(
    c("technology", "car", "materials", "car", "materials", "surfaces"),
    levels = c("car", "materials", "surfaces", "technology")
  ),
  R1 = factor(
    c("The new technology", "The antique car", "The course materials",
      "The antique car", "The course materials", "The road surfaces"),
    levels = c("The antique car", "The course materials",
               "The new technology", "The road surfaces")
  )
)
My goal is to get output that (in part) looks like this:
Region RT WordRegion Word
R1_1.RT 237 R1_1 the
...
R1_2.RT 177 R1_2 new
...
EDIT: The variable ending with ".RT" (e.g., R1_1.RT) are Region names and will be melted into a Region column. The variables ending in numbers (e.g., R1_1) correspond exactly to the Region names and their associated values. I want them to be melted alongside the Region names so that I can analyze them in relation to the Region column
In the first part of the code, I melt all of the values into a Region column and change the value to RT. This seems to work fine:
# Long transform: stack the four reading-time columns into a Region column
# (which column the value came from) plus one value column.
# NOTE(review): `var` relies on partial argument matching against melt()'s
# variable-name argument; spell the full argument name out once it is settled
# which package's melt() is in use.
SmallMelt1 <- melt(df, measure.vars = c("R1_1.RT", "R1_2.RT", "R1_3.RT", "R1.RT"), var = "Region")
# Rename the melted value column to "RT". It is located by name rather than
# by a hard-coded position, so the rename still hits the right column when the
# number of id columns in the data changes.
colnames(SmallMelt1)[colnames(SmallMelt1) == "value"] <- "RT"
But I don't get how to simultaneously melt another span of variables such that they will line up vertically with the first span. I want to do something like this, after the first melt, but it does not work:
# Second melt for the region words (this is the attempt that doesn't work).
# NOTE(review): the input here is SmallMelt1, which is already in long form,
# so presumably every RT row is repeated once per word column and the RT
# regions no longer line up one-to-one with the word regions - confirm
# against the actual output.
SmallMelt2 = melt(SmallMelt1, measure.vars = c("R1_1", "R1_2", "R1_3", "R1"), var = "WordRegion")
# Rename the value column to "Word"; the index must be the position of the
# "value" column in SmallMelt2.
colnames(SmallMelt2)[9] <- "Word" #add col number for "value" here
Please let me know if you need any clarification. I hope someone can help... thanks in advance - DT
So, after consulting with someone off-list, I found the solution. My mistake was that I was trying to run the second step on the output of the first step. By running the two steps independently on the original data and then concatenating, I get the right result.
# Melt the reading times and the region words independently from the ORIGINAL
# data, then bind them side by side. Both melts stack four measure columns
# listed in the same region order, so row i of one result describes the same
# observation and region as row i of the other.
SmallMelt1 <- melt(df, measure.vars = c("R1_1.RT", "R1_2.RT", "R1_3.RT", "R1.RT"), var = "Region")
SmallMelt2 <- melt(df, measure.vars = c("R1_1", "R1_2", "R1_3", "R1"), var = "WordRegion")
# Guard the row-by-row pairing that cbind() relies on.
stopifnot(nrow(SmallMelt1) == nrow(SmallMelt2))
# Pick the columns by name rather than by position, and give them real names
# instead of the auto-generated "SmallMelt2[, 11]". Word stays directly after
# the SmallMelt1 columns; WordRegion is appended so the word can be analysed
# against its region.
SmallMelt3 <- cbind(
  SmallMelt1,
  Word = SmallMelt2[["value"]],
  WordRegion = SmallMelt2[["WordRegion"]]
)
# The value column inherited from SmallMelt1 holds the reading times.
colnames(SmallMelt3)[colnames(SmallMelt3) == "value"] <- "RT"