Add characters to existing cells with condition R - r

I have the following table:
structure(list(Id = structure(c(1L, 1L, 2L, 2L, 1L, 3L, 3L, 3L
), .Label = c("a", "b", "c"), class = "factor"), stops = c(1,
1, 1, 1, 1, 2, 2, 2)), .Names = c("Id", "stops"), row.names = c(NA,
-8L), class = "data.frame")
I would like to add to $stops new characters when the stop did not change but the $Id did.
For example, I would like to get:
structure(list(Id = structure(c(1L, 1L, 2L, 2L, 1L, 3L, 3L, 3L
), .Label = c("a", "b", "c"), class = "factor"), stops = structure(c(1L,
1L, 2L, 2L, 3L, 4L, 4L, 4L), .Label = c("1", "1-1", "1-2", "2"
), class = "factor")), .Names = c("Id", "stops"), row.names = c(NA,
-8L), class = "data.frame")
I only want to do this when the Id is different from the previous row's and the stop is the same as the previous row's.
I tried with mutate(), but I seem to be quite far from getting something that works.

Here's a loopless attempt using data.table
library(data.table)
# Convert df to a data.table in place, store stops as character (so a suffix
# can be appended later) and add Idindx, a run id that increases every time Id
# changes from one row to the next.
setDT(df)[, `:=`(stops = as.character(stops), Idindx = rleid(Id))]
# Keep one row per Id run and, within each run of consecutive equal stops,
# number the Id runs 0, 1, 2, ... (counter 0 marks the first Id run of a stop).
indx <- unique(df, by = "Idindx")[, counter := (1:.N) - 1L, by = rleid(stops)]
# Update join on Idindx: only Id runs with counter > 0 are matched, and their
# rows get "<stops>-<counter>"; i.counter is the counter column of the joined
# (i) table.
df[indx[counter > 0], stops := paste(stops, i.counter, sep = "-"), on = "Idindx"]
# Id stops Idindx
# 1: a 1 1
# 2: a 1 1
# 3: b 1-1 2
# 4: b 1-1 2
# 5: a 1-2 3
# 6: c 2 4
# 7: c 2 4
# 8: c 2 4
The first step is to create a unique index for each run of consecutive identical Ids (the Id values themselves are not unique across the table) and to convert stops to character (per your desired output).
Then, working on one row per index, number the consecutive Id runs that share the same stops value and join those counts back to the original data.

You could write a loop to solve your problem:
# Original data
data <- structure(list(Id = structure(c(1L, 1L, 2L, 2L, 1L, 3L, 3L, 3L
), .Label = c("a", "b", "c"), class = "factor"), stops = c(1,
1, 1, 1, 1, 2, 2, 2)), .Names = c("Id", "stops"), row.names = c(NA,
-8L), class = "data.frame")
# Add a character copy of stops; the loop below appends "-<counter>" to it
data$stops_new <- as.character(data$stops)
n_rows <- nrow(data)
# Running suffix counter, incremented after every relabelled run of rows
new <- 1
# seq_len(n_rows)[-1] is 2..n and is empty for tables with fewer than two
# rows, whereas 2:nrow(data) would count down and index row 0.
for (i in seq_len(n_rows)[-1]) {
  # Relabel when the Id changed but the stop did not
  if (data$Id[i] != data$Id[i - 1] && data$stops[i] == data$stops[i - 1]) {
    data$stops_new[i] <- paste0(data$stops_new[i], "-", new)
    # Apply the same label to the following rows with the same Id and stop.
    # The bound check comes first so that a run reaching the last row does not
    # index past the end of the table (which would yield NA and abort).
    j <- i + 1
    while (j <= n_rows &&
           data$Id[i] == data$Id[j] &&
           data$stops[i] == data$stops[j]) {
      data$stops_new[j] <- paste0(data$stops[i], "-", new)
      j <- j + 1
    }
    new <- new + 1
  }
}
data

this is a base R solution.
Create indicators showing whether Id has changed (id.ind) and whether stops has changed (stops.ind) relative to the previous row. By convention, both indicators are set to 0 (i.e. no change) for the first row:
stops.ind <- c(0, diff(dat$stops))
id.ind <- c(0, diff(as.numeric(dat$Id)))
create new stops vector:
stops <- new.stops <- dat$stops
Check row by row whether (a) the Id changed but stops did not, or (b) neither changed relative to the previous row. In case (a), increase k by one and append "-k" to the stops value; in case (b), reuse the previous row's new stops value:
# Running counter for the "-k" suffixes; it keeps increasing over the whole
# table and is not reset when the stop value changes.
k <- 0
# seq_len(nrow(dat))[-1] yields 2..n and is empty for a one-row table, whereas
# 2:nrow(dat) would iterate 2, 1 and index past the end of the indicators.
for (i in seq_len(nrow(dat))[-1]) {
  if (id.ind[i] != 0 && stops.ind[i] == 0) {
    # Id changed while the stop stayed the same: start a new suffixed label.
    k <- k + 1
    new.stops[i] <- paste0(stops[i], "-", k)
  } else if (id.ind[i] == 0 && stops.ind[i] == 0) {
    # Neither Id nor stop changed: carry the previous row's label forward.
    new.stops[i] <- new.stops[i - 1]
  }
}
new.stops
# [1] "1" "1" "1-1" "1-1" "1-2" "2" "2" "2"
new.dat <- data.frame(Id = dat$Id, stops = new.stops)

Related

I keep getting an output of 0 for this loop. How do I fix it?

I am trying to split my data frame into 4 smaller data frames according to the vaccine used and the diagnosis.
Here is the loop I've been trying to use:
# Build one data frame per vaccine/diagnosis combination for a single gene.
gene_of_interest <- '1'
vaccines <- c("A", "B")
diagnosis <- c("Sick", "Healthy")
for (v in vaccines)
{
for (d in diagnosis)
{
# Keep only the rows matching the current vaccine, diagnosis and gene.
# NOTE(review): the filtered result is assigned back to
# CDR3_post_challenge_plot_prep itself, so after the first pass the source
# table holds only the "A"/"Sick" rows and every later filter runs on that
# already-reduced table. Storing the subset under a different name would leave
# the source table intact.
CDR3_post_challenge_plot_prep <- CDR3_post_challenge_plot_prep[CDR3_post_challenge_plot_prep$Vaccine == v & CDR3_post_challenge_plot_prep$Diagnosis == d & CDR3_post_challenge_plot_prep$gene == gene_of_interest ,]
# Store the current subset in a variable named IgH_CDR3_COI_<vaccine>_<diagnosis>.
assign(paste0("IgH_CDR3_COI_", v, "_", d), CDR3_post_challenge_plot_prep)
}
}
The only data frame with any observations outputted from this loop is the one that satisfies the first conditions, that is, "A_Sick". But I know there should be observations in the other 2 data frames.
Here is some of what the data frame looks like:
structure(list(gene = c("1", "1", "2", "3",
"1", "1"), abundance = c(27L, 15L, 33L, 20L, 20L,
69L), Timepoint2 = c("D0.12h", "D0.12h", "D0.12h", "D0.12h",
"D0.12h", "D0.12h"), Vaccine = structure(c(2L, 3L, 3L, 2L, 3L,
2L), .Label = c("Control", "B", "A"), class = "factor"),
Diagnosis = structure(c(2L, 1L, 2L, 2L, 1L, 1L), .Label = c("Healthy",
"Sick", "UNKNOWN - Not Challenged", "UNKNOWN - Treated prior to meeting diagnostic criteria"
), class = "factor")), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))

Creating a new column based on the condition of others

Still very new to coding and R, I am working with some healthcare data in a data frame. There are 3 outcomes that I am interested in - Mobilised_D1, Diet_D1 and Catheter_rm_D1. I wish to create a fourth column called AnyTwo whereby if any 2 of the 3 outcomes are Y or all three outcomes are Y, then it will be T for AnyTwo.
I've managed to do this by using [] as below:
# AnyTwo is TRUE when at least two of the three outcomes are "Y": one
# assignment per pair of outcomes. Rows where all three are "Y" are matched by
# every pair, so they need no line of their own.
ERAS_limited[ERAS_limited$Mobilised_D1 == "Y" & ERAS_limited$Catheter_rm_D1 == "Y", "AnyTwo"] <- TRUE
ERAS_limited[ERAS_limited$Diet_D1 == "Y" & ERAS_limited$Catheter_rm_D1 == "Y", "AnyTwo"] <- TRUE
ERAS_limited[ERAS_limited$Mobilised_D1 == "Y" & ERAS_limited$Diet_D1 == "Y", "AnyTwo"] <- TRUE
dput(head(ERAS_limited))
structure(list(Mobilised_D1 = structure(c(2L, 2L, 1L, 1L, 1L,
2L), .Label = c("N", "Y"), class = "factor"), Diet_D1 = structure(c(2L,
2L, 2L, 2L, 1L, 2L), .Label = c("N", "Y"), class = "factor"),
Catheter_rm_D1 = structure(c(2L, 2L, 1L, 1L, 1L, 2L), .Label = c("N",
"Y"), class = "factor"), AnyTwo = c(TRUE, TRUE, FALSE, FALSE,
FALSE, TRUE)), row.names = c(NA, 6L), class = "data.frame")```
However, I would be keen to see if there is a more elegant way of doing this e.g. by writing a loop for my own education and curiosity.
We can use rowSums to create the logical vector
library(dplyr)
# Count the "Y" entries per row and flag rows that have at least two of them.
# `.[-4]` drops the fourth column by position before the comparison; in the
# dput above that column is the existing AnyTwo.
ERAS_limited %>%
mutate(AnyTwo = rowSums(.[-4] == "Y") >= 2)
In base R, it would be
# Count the "Y" entries per row across the three outcome columns and flag rows
# with at least two. The comparison with "Y" has to happen inside rowSums().
# Columns are selected by name so the result does not depend on whether an
# AnyTwo column already exists or where it sits.
ERAS_limited$AnyTwo <- rowSums(
  ERAS_limited[c("Mobilised_D1", "Diet_D1", "Catheter_rm_D1")] == "Y"
) >= 2

For loop & if else working for less data but not working for more data

Calculation inside for loop & ifelse is working when I have 100-200 rows but not working when I have 20000 rows.
Can someone help me with the for loop and if/else logic: is something wrong with the code, or is some timeout happening in RStudio when the loop runs?
Code:
#FROM HERE IT IS NOT WORKING WHEN WE HAVE 20000 ROWS OF DATA IN FINAL DATFRAME.
#WE ARE CREATING FINAL_V1 WHICH IS POPULATING ONLY 1 ROW
#New Dataframe with Null values
Final <- structure(list(Item = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "0S1576", class = "factor"),
LC = structure(1:6, .Label = c("MW92", "OY01", "RM11", "RS11",
"WK14", "WK15"), class = "factor"), Fiscal.Week = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "2019-W24", class = "factor"),
SS = c(15L, 7L, 5L, 9L, 2L, 2L), Freq = c(3, 6, 1, 2, 1,
1), agg = c(1, 1, 1, 1, 0, 0)), row.names = c(NA, -6L), class = "data.frame")
lctolc <- structure(list(Item = structure(c(1L, 1L, 1L, 1L, 1L), .Label = "0S1576", class = "factor"),
LC = structure(c(1L, 2L, 2L, 3L, 3L), .Label = c("MW92",
"OY01", "RM11"), class = "factor"), ToLC = structure(1:5, .Label = c("OY01",
"RM11", "RS11", "WK14", "WK15"), class = "factor")), row.names = c(NA,
-5L), class = "data.frame")
# One-column data frame holding each distinct Item once.
df <- as.data.frame(unique(Final$Item))
# Placeholder values; each of these is overwritten by the loops below.
Final_v1 <- NA
j <- 1
i <- 1
# SS computations
# Outer loop: one pass per unique item in df.
# NOTE(review): Final_v1 is reassigned at the top of every pass, so after the
# loop it only holds the rows of the last item; results for earlier items are
# not stored anywhere.
for(j in 1:nrow(df)) {
# Rows of Final that belong to the current item (item compared as character).
Final_v1 <- Final[Final$Item == as.character(df[j,1]),]
# Inner loop: one pass per row of the current item's subset.
# Columns are addressed by position; in the sample Final above these are
# 4 = SS, 5 = Freq and 6 = agg. Final has no column 7: the first assignment to
# [i,7] creates it, and it is renamed to "SSNew" after the inner loop.
for(i in 1:nrow(Final_v1)) {
if(Final_v1[i,6] <= 0)
{
Final_v1[i,7] = Final_v1[i,4]}
else
{
if(Final_v1[i,5] == '1')
{
Final_v1[i,7]=0
}
else
{
Final_v1[i,7]=Final_v1[i,4]
}
# Everything from here to the end of this else-branch runs only for rows
# whose column 6 is greater than 0.
SSNew <- Final_v1[i,7]
# Leftover distribution: look up the ToLC targets listed in lctolc for this
# row's Item/LC pair and flag the rows of Final_v1 whose LC is such a target.
# NOTE(review): in the sample data both LC columns are factors with different
# level sets, and `==` between such factors raises an error; comparing via
# as.character() would avoid that -- confirm against the real column types.
LCS <- lctolc$ToLC[Final_v1$Item[i] == lctolc$Item & Final_v1$LC[i] == lctolc$LC]
inds <- Final_v1$LC %in% LCS
if (any(inds))
# NOTE(review): `Final_v1$SS[inds]==0` is a comparison, not an assignment, so
# when SSNew is 0 the flagged SS entries are replaced by the TRUE/FALSE result
# of that test; presumably a plain 0 was intended -- confirm. The else-branch
# writes SS back unchanged.
{ Final_v1$SS[inds]<- if (SSNew == 0) {Final_v1$SS[inds]==0} else {Final_v1$SS[inds]=Final_v1$SS[inds]} }
}
}
# Give the column created in the inner loop its final name.
names(Final_v1)[7] <- "SSNew"
}
Can someone help why it is not performing for 20000rows

Elegant way to write function

I have an input column (symbols) which has more than 10000 rows and they contain operator symbols and text values like ("",">","<","","****","inv","MOD","seen") as shown below in the code as values. This column doesn't contain any numbers. It only contains the value which are stated in the code.
What I would like to do is map those operator symbols ('<','>' etc) to different codes, 1) Operator_codes 2) Value_codes and have these two different codes as separate columns
I already have a working code but it is not very efficient as you can see I repeat the same operation twice. Once for Operator_codes and then for value_codes. I am sure there must be some efficient way to write this. I am new to R and not very familiar with other approach.
# Maps each symbol to an operator code and to a value code by building two
# Symbol/Code lookup tables and merging each of them onto the input symbols.
# NOTE(review): as pasted, this function does not parse or run; see the notes
# inside the body.
oper_val_concepts = function(DF){
# NOTE(review): `.` is not defined in this scope, the argument DF is never
# used, and str_extract() is called without a pattern; presumably the symbol
# column of DF was meant -- confirm.
operators_source = str_extract(.$symbols)
operators_source = as.data.frame(operators_source)
colnames(operators_source) <- c("Symbol")
# The symbols and their two parallel code vectors (eight entries each, same
# order).
# NOTE(review): "MOD" is upper-case here while the sample data contains "mod".
operator_list = c("",">","<","-","****","inv","MOD","seen")
operator_codes = c(123L,14L,16L,13L,0L,0L,0L,0L)
value_codes=c(14L,12L,32L,123L,16L
,41L,116L,186L)
# Pair each symbol with its operator code, one data-frame column per pair.
# NOTE(review): the `%>%` on its own line starts a new statement and is a
# syntax error; it has to end the previous line, as in the value_code_map
# statement below.
operator_code_map = map2(operator_list,operator_codes,function(x,y)c(x,y))
%>%
data.frame()
# Same pairing for the value codes.
value_code_map = map2(operator_list,value_codes,function(x,y) c(x,y)) %>%
data.frame()
# Transpose so that each symbol/code pair becomes a row, then label the two
# columns and drop the row names.
operator_code_map = t(operator_code_map)
value_code_map = t(value_code_map)
colnames(operator_code_map) <- c("Symbol","Code")
colnames(value_code_map) <- c("Symbol","Code")
rownames(operator_code_map) = NULL
rownames(value_code_map) = NULL
# Join the codes onto the symbols, keeping every input row (all.x = TRUE),
# once per code type.
dfm<-merge(x=operators_source,y=operator_code_map,by="Symbol",all.x =
TRUE)
# NOTE(review): this assignment is the last expression, so only dfm1 is
# returned (invisibly); dfm is discarded.
dfm1<-merge(x=operators_source,y=value_code_map,by="Symbol",all.x = TRUE)
}
t1 = oper_val_concepts(test)
dput command output is
structure(list(Symbols = structure(c(2L, 3L, 1L, 4L, 2L, 3L,
5L, 4L, 6L), .Label = c("****", "<", ">", "inv", "mod", "seen"
), class = "factor")), .Names = "Symbols", row.names = c(NA,-9L), class =
"data.frame")
I am expecting an output to be two columns in a dataframe as shown below.
Based on what I am understanding, it seems like you want to create a dataframe that will act as a key (see key below). Once you have this, you can just join the dataframe that just contains symbols with this key dataframe.
df <- structure(list(Symbols = structure(c(2L, 3L, 1L, 4L, 2L, 3L,
5L, 4L, 6L), .Label = c("****", "<", ">", "inv", "mod", "seen"
), class = "factor")), .Names = "Symbols", row.names = c(NA, -9L), class = "data.frame")
# Lookup table: one row per symbol, carrying both of its codes.
key <- data.frame(
  Symbols = c("", ">", "<", "-", "****", "inv", "mod", "seen"),
  Oerator_code_map = c(123L, 14L, 16L, 13L, 0L, 0L, 0L, 0L),
  value_code_map = c(14L, 12L, 32L, 123L, 16L, 41L, 116L, 186L)
)
# Attach both code columns to every row of df by matching on Symbols.
left_join(df, key, by = "Symbols")
output
Symbols Oerator_code_map value_code_map
1 < 16 32
2 > 14 12
3 **** 0 16
4 inv 0 41
5 < 16 32
6 > 14 12
7 mod 0 116
8 inv 0 41
9 seen 0 186

How do I write a function to manipulate several dataframes the same way?

Complete novice. I do not know how to write a function. I have several dataframes that all need to be manipulated in the same way and the output should be dataframes with the same names. I have functioning code that can manipulate a single dataframe. I would like to be able to manipulate several at once.
Here are 2 example df's:
ex1 <- structure(list(info1 = c("Day", "2018.04.03 10:47:33", "2018.04.03 11:20:04", "2018.04.03 11:35:04"), info2 = c("Status_0", "Ok", "Ok", "Ok"
), X = c(200L, 1L, 2L, 3L), X.1 = c(202.5, 1, 2, 3), X.2 = c(205L,
1L, 2L, 3L), X.3 = c(207.5, 1, 2, 3), X.4 = c(210L, 1L, 2L, 3L
), X.5 = c(212.5, 1, 2, 3), X.6 = c(215L, 1L, 2L, 3L)), class = "data.frame", row.names = c(NA, -4L))
ex2 <- structure(list(info1 = c("Day", "2018.04.10 12:47:33", "2018.04.10 13:20:04", "2018.04.10 13:35:04"), info2 = c("Status_0", "Ok", "Ok", "Ok"
), X = c(200L, 1L, 2L, 3L), X.1 = c(202.5, 1, 2, 3), X.2 = c(205L,
1L, 2L, 3L), X.3 = c(207.5, 1, 2, 3), X.4 = c(210L, 1L, 2L, 3L
), X.5 = c(212.5, 1, 2, 3), X.6 = c(215L, 1L, 2L, 3L)), class = "data.frame", row.names = c(NA, -4L))
Here is the functioning code to manipulate 'ex1'
library(tidyverse)
library(lubridate)
# Promote the first row, which holds the real headers, to column names.
colnames(ex1) <- ex1[1,]
ex1 <- ex1 %>%
# Drop the header row now that it has become the column names.
slice(-1) %>%
# NOTE(review): the sample ex1 above has "Day" as its first header, so a
# "Date/Time" column is presumably only present in the real data -- confirm.
rename(Date.Time = "Date/Time") %>%
# Parse the text timestamps into a date-time column.
mutate(timestamp = parse_date_time(Date.Time, "%Y.%m.%d %H:%M:%S")) %>%
# Move timestamp to the front, then drop the original text column.
select(timestamp, Date.Time, everything()) %>% select(-Date.Time) %>%
# Drop the column ranges Status_0 through "202.5" and "212.5" through "215".
select(-c(Status_0:"202.5", "212.5":"215"))
# Prefix every column except the first with "raw_".
colnames(ex1)[-1] <- paste("raw", colnames(ex1)[-1], sep = "_")
Secondary question: let's say I wanted to change the function so that it accepted a df and also a type (i.e., raw or comp), so the call would be tidydatafunc(df, type). If I passed type = "comp", it would change the last line of the code, where I have "raw", to "comp". How could I change the function to accommodate this?
Any help is greatly appreciated. I'm sure this is basic stuff for most of you!
Wrap your script in a function and specify its parameters.
# Tidy one exported data frame whose first row holds the real column headers.
#
# Arguments:
#   df   - data frame to tidy; its first row is used as the column names.
#   type - character prefix (e.g. "raw" or "comp") put in front of every
#          column name except the first.
# Returns the tidied data frame: header row removed, a parsed `timestamp`
# column first, the listed column ranges dropped and the remaining columns
# prefixed with `type`.
# Requires dplyr (pipe and verbs) and lubridate (parse_date_time) to be attached.
my_fun <- function(df, type = 'comp') {
  # basic input validation is extremely useful
  stopifnot(is.data.frame(df))
  stopifnot(is.character(type))
  # Promote the first row to column names.
  colnames(df) <- df[1, ]
  # Assign the pipeline result back to `df` so that the renaming and the return
  # value below act on the tidied table rather than on the untouched input.
  df <- df %>%
    slice(-1) %>%
    rename(Date.Time = "Date/Time") %>%
    mutate(timestamp = parse_date_time(Date.Time, "%Y.%m.%d %H:%M:%S")) %>%
    select(timestamp, Date.Time, everything()) %>%
    select(-Date.Time) %>%
    select(-c(Status_0:"202.5", "212.5":"215"))
  # pass the character type
  colnames(df)[-1] <- paste(type, colnames(df)[-1], sep = "_")
  df
}
Then you can use it.
my_fun(ex1, "comp") # view
new_ex1 <- my_fun(ex1, "comp") # save to variable new_ex1

Resources