Conditionally replace cells in data frame based on another data frame - r

In the interest of learning better coding practices, can anyone show me a more efficient way of solving my problem? Maybe one that doesn't require new columns...
Problem: I have two data frames: one is my main data table (t) and the other contains changes I need to replace in the main table (Manual_changes). Example: Sometimes the CaseID is matched with the wrong EmployeeID in the file.
I can't provide the main data table, but the Manual_changes file looks like this:
Manual_changes = structure(list(`Case ID` = c(46605, 25321, 61790, 43047, 12157,
16173, 94764, 38700, 41798, 56198, 79467, 61907, 89057, 34232,
100189), `Employee ID` = c(NA, NA, NA, NA, NA, NA, NA, NA, 906572,
164978, 145724, 874472, 654830, 846333, 256403), `Age in Days` = c(3,
3, 3, 12, 0, 0, 5, 0, NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA,
-15L), class = c("tbl_df", "tbl", "data.frame"))
# Apply the corrections in `Manual_changes` to the main table `t`, matching
# rows on `Case ID` and updating the affected columns by NAME.
#
# No merge and no helper `.x`/`.y` columns are needed, so the row order and
# column order of `t` are left exactly as they were. (Relabelling the merged
# result by position with `colnames(temp) = colnames(t)` only gives correct
# labels when `Employee ID` and `Age in Days` are the last two columns of
# `t`, and `merge()` additionally re-sorts the rows by the key.)
#
# If a `Case ID` occurs more than once in `Manual_changes`, `match()` uses
# its first occurrence.
change_row <- match(t$`Case ID`, Manual_changes$`Case ID`)
for (col in c("Employee ID", "Age in Days")) {
  replacement <- Manual_changes[[col]][change_row]
  # An NA in Manual_changes means "no correction": keep the current value.
  has_change <- !is.na(replacement)
  t[[col]][has_change] <- replacement[has_change]
}

We could use coalesce
library(dplyr)
# Join the corrections onto the main table and let a non-missing manual value
# win over the original one (coalesce() returns its first non-NA argument, so
# the correction column must come first). The original columns keep their
# names (empty suffix), the correction columns get ".new" and are dropped
# again afterwards. Assign the result back to `t` to keep the changes.
left_join(t, Manual_changes, by = "Case ID", suffix = c("", ".new")) %>%
  mutate(
    `Employee ID` = coalesce(`Employee ID.new`, `Employee ID`),
    `Age in Days` = coalesce(`Age in Days.new`, `Age in Days`)
  ) %>%
  select(-all_of(c("Employee ID.new", "Age in Days.new")))
Or with data.table
library(data.table)
# Update join: for every row of `t` whose `Case ID` appears in
# `Manual_changes`, overwrite the two columns by reference. Inside `j`,
# columns coming from `i` (here Manual_changes) are addressed with the `i.`
# prefix; a data.table join does not create `.x`/`.y` columns. The correction
# is listed first so a non-NA manual value wins over the existing one.
# NOTE(review): fcoalesce() needs both arguments to have the same type --
# confirm that the columns of `t` are numeric like those in Manual_changes.
setDT(t)[
  Manual_changes,
  c("Employee ID", "Age in Days") := .(
    fcoalesce(`i.Employee ID`, `Employee ID`),
    fcoalesce(`i.Age in Days`, `Age in Days`)
  ),
  on = "Case ID"
]

Related

Overwrite dataframe columns in R

I'm making a dataframe in R.
With this code I'm currently making a dataframe with 1 row.
# Build a data frame with a single row: every argument below is a length-one
# value, so data.frame() produces exactly one observation.
# Columns initialised with a bare NA are created as logical columns; they
# take on another type once a non-logical value is assigned to them.
logs_data_frame <- data.frame(
session_id = 1,
t_minus = NA,
glucose_reading = NA,
activity_type = "test",
duration = "test2",
intensity = "test3",
activity_start = "test4",
bolus_taken = NA,
bolus_3hr = NA,
previous_flow = NA,
new_flow_number = NA,
action_bolus = NA,
action_basaal = NA,
action_carbs = NA,
recommendation_bolus = NA,
recommendation_basaal = NA,
recommendation_carbs = NA,
bolus_1hr = NA,
# Date on which this code is run.
date = Sys.Date(),
pid = 2
)
Now I'm wondering how to overwrite some values, such as t_minus and glucose_reading, so that instead of NA they contain "test5" and "test6", without adding another row.
Kind regards and thanks for the help!

Joining 'n' number of lists and perform a function in R

I have a data frame that contains many triplicates (sets of 3 columns), and I have split it so that each triplicate is a separate data frame in a list.
The example dataset is,
# Example input: 10 rows x 12 columns, i.e. four triplicates of measurements
# (3 ng, 7.5 ng, 10 ng and 15 ng) with missing values.
example_data <- structure(list(`1_3ng` = c(69648445400, 73518145600, NA, NA,
73529102400, 75481088000, NA, 73545910600, 74473949200, 77396199900
), `2_3ng` = c(71187990600, 70677690400, NA, 73675407400, 73215342700,
NA, NA, 69996254800, 69795686400, 76951318300), `3_3ng` = c(65032022000,
71248214000, NA, 72393058300, 72025550900, 71041067000, 73604692000,
NA, 73324202000, 75969608700), `4_7-5ng` = c(NA, 65845061600,
75009245100, 64021237700, 66960666600, 69055643600, NA, 64899540900,
NA, NA), `5_7-5ng` = c(65097201700, NA, NA, 69032126500, NA,
70189899800, NA, 74143529100, 69299087400, NA), `6_7-5ng` = c(71964413900,
69048485800, NA, 71281569700, 71167596500, NA, NA, 68389822800,
69322289200, NA), `7_10ng` = c(71420403700, 67552276500, 72888076300,
66491357100, NA, 68165019600, 70876631000, NA, 69174190100, 63782945300
), `8_10ng` = c(NA, 71179401200, 68959365100, 70570182700, 73032738800,
NA, 74807496700, NA, 71812102100, 73855098500), `9_10ng` = c(NA,
70403756100, NA, 70277421000, 69887731700, 69818871800, NA, 71353886700,
NA, 74115466700), `10_15ng` = c(NA, NA, 68487581700, NA, NA,
69056997400, NA, 67780479400, 66804467800, 72291939500), `11_15ng` = c(NA,
63599643700, NA, NA, 60752029700, NA, NA, 63403655600, NA, 64548492900
), `12_15ng` = c(NA, 67344750600, 61610182700, 67414425600, 65946654700,
66166118400, NA, 70830837700, 67288305700, 69911451300)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -10L))
And after grouping I got the four lists, since the above example dataset contains 4 groups. I have used the following R code for grouping the data,
#' Split the columns of a data frame into consecutive fixed-size groups
#'
#' Columns 1..group_size form the first group, the next `group_size` columns
#' the second group, and so on.
#'
#' @param df A data frame (or matrix) whose columns are ordered by group.
#' @param group_size Number of consecutive columns per group. Defaults to 3
#'   (triplicates).
#' @return A list with one element per complete group, named "data0",
#'   "data1", ...; each element holds that group's columns of `df`. Columns
#'   left over after the last complete group are not returned (a warning is
#'   raised). An input with fewer than `group_size` columns gives an empty
#'   list.
grouping_data <- function(df, group_size = 3L) {
  stopifnot(length(group_size) == 1L, !is.na(group_size), group_size >= 1)
  group_size <- as.integer(group_size)

  n_groups <- ncol(df) %/% group_size
  leftover <- ncol(df) %% group_size
  if (leftover > 0L) {
    warning(
      "ncol(df) is not a multiple of group_size; the last ", leftover,
      " column(s) are not assigned to any group",
      call. = FALSE
    )
  }

  # Zero-based group ids, so the element names stay "data0", "data1", ...
  group_ids <- seq_len(n_groups) - 1L
  data <- lapply(group_ids, function(g) {
    # drop = FALSE keeps a one-column group as a data frame / matrix.
    df[, g * group_size + seq_len(group_size), drop = FALSE]
  })
  names(data) <- paste0("data", group_ids)
  data
}
group_data <-grouping_data(example_data)
Please suggest R code that applies a particular function to all the list elements at the same time.
For example the below function I have done by following way,
#VSN Normalization
# Run vsn::justvsn() on `dat` and return the result as a matrix that carries
# the row and column names of the input.
vsnNorm <- function(dat) {
  input <- as.data.frame(dat)
  input_matrix <- as.matrix(input)
  # justvsn() reports progress via messages; silence them.
  normalised <- suppressMessages(vsn::justvsn(input_matrix))
  colnames(normalised) <- colnames(input)
  row.names(normalised) <- rownames(input)
  as.matrix(normalised)
}
And I have tried like below,
# Normalise each of the four groups on its own ...
vsn.dat0 <- vsnNorm(group_data[["data0"]])
vsn.dat1 <- vsnNorm(group_data[["data1"]])
vsn.dat2 <- vsnNorm(group_data[["data2"]])
vsn.dat3 <- vsnNorm(group_data[["data3"]])
# ... then put the normalised groups back side by side, column-wise.
vsn.dat <- cbind(
  vsn.dat0,
  vsn.dat1,
  vsn.dat2,
  vsn.dat3
)
It is working well.
But the number of triplicates (3-column sets) may change from dataset to dataset, and calling the function on every list element by hand each time becomes tedious.
So kindly share some code that applies a function to every element of the resulting list and combines the results into a single object.
Thank you in advance.
The shortcut you are looking for is:
# Normalise every group, then bind the results side by side. cbind() (not
# rbind()) reproduces the manual
# `cbind(vsn.dat0, vsn.dat1, vsn.dat2, vsn.dat3)`: each group contributes its
# own columns, whereas rbind() would stack the groups as extra rows.
# unname() drops the list names so they are not passed to cbind() as
# argument names.
vsn.dat <- do.call(cbind, unname(lapply(group_data, vsnNorm)))

How to return the column name that matches a certain condition in R (prefer data.table)?

I have a data set similar to the following format:
Each user has only one variable that is not NA. I want to return the name of this non-NA column as follows:
Writing a loop over the rows would easily solve this problem, but I want to use data.table to generate this variable.
With base R, it would be more efficient
# For every row, find the position of the first non-NA value among the value
# columns (everything except the leading `user` column) and store the name of
# that column.
value_cols <- names(df1)[-1]
first_non_na <- max.col(!is.na(df1[-1]), ties.method = "first")
df1$NonNA_VarName <- value_cols[first_non_na]
df1$NonNA_VarName
#[1] "v1" "v2" "v1" "v3" "v4" "v3"
With data.table, an option is to melt into 'long' format and then extract the 'variable' column
library(data.table)
# Reshape to long format, dropping the NA cells, so that the rows remaining
# for a user name that user's non-NA column(s) in `variable`.
long_df1 <- melt(setDT(df1), id.vars = "user", na.rm = TRUE)
# Keep the first such column per user ...
first_var <- long_df1[, .(NonNA_VarName = first(variable)), by = user]
# ... and join the original wide columns back on.
first_var[df1, on = .(user)]
Or another option is to group by 'user' and use which.max to return the index
# Per user, flatten that user's value columns (.SD) into one vector and take
# the column name at the position which.max() returns; which.max() skips NA
# entries.
setDT(df1)[, NonNA_VarName := {
  row_values <- unlist(.SD)
  names(.SD)[which.max(row_values)]
}, by = user]
data
df1 <- structure(list(user = 1:6, v1 = c(3, NA, 2, NA, NA, NA), v2 = c(NA,
5, NA, NA, NA, NA), v3 = c(NA, NA, NA, 5, NA, 7), v4 = c(NA,
NA, NA, NA, 4, NA)), class = "data.frame", row.names = c(NA,
-6L))

linear regression model with dplyr on specified columns by name

I have the following data frame, each row containing four dates ("y") and four measurements ("x"):
df = structure(list(x1 = c(69.772808673525, NA, 53.13125414839,
17.3033274666411,
NA, 38.6120670385487, 57.7229000792707, 40.7654208618078, 38.9010405201831,
65.7108936694177), y1 = c(0.765671296296296, NA, 1.37539351851852,
0.550277777777778, NA, 0.83037037037037, 0.0254398148148148,
0.380671296296296, 1.368125, 2.5250462962963), x2 = c(81.3285388496182,
NA, NA, 44.369872853302, NA, 61.0746827226573, 66.3965114460601,
41.4256874481852, 49.5461413070349, 47.0936997726146), y2 =
c(6.58287037037037,
NA, NA, 9.09377314814815, NA, 7.00127314814815, 6.46597222222222,
6.2462962962963, 6.76976851851852, 8.12449074074074), x3 = c(NA,
60.4976916064608, NA, 45.3575294731303, 45.159758146854, 71.8459173097114,
NA, 37.9485456227131, 44.6307631013742, 52.4523342186143), y3 = c(NA,
12.0026157407407, NA, 13.5601157407407, 16.1213657407407, 15.6431018518519,
NA, 15.8986805555556, 13.1395138888889, 17.9432638888889), x4 = c(NA,
NA, NA, 57.3383407228293, NA, 59.3921356160536, 67.4231673171527,
31.853845252547, NA, NA), y4 = c(NA, NA, NA, 18.258125, NA,
19.6074768518519,
20.9696527777778, 23.7176851851852, NA, NA)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -10L))
I would like to create an additional column containing the slope of all the y's versus all the x's, for each row (each row is a patient with these 4 measurements).
Here is what I have so far:
df <- df %>% mutate(Slope = lm(vars(starts_with("y") ~
vars(starts_with("x"), data = .)
I am getting an error:
invalid type (list) for variable 'vars(starts_with("y"))'...
What am I doing wrong, and how can I calculate the rowwise slope?
You are using a tidyverse syntax but your data is not tidy...
Maybe you should rearrange your data.frame and rethink the way you store your data.
Here is how to do it in a quick and dirty way (at least if I understood your explanations correctly):
# Column positions of the measurements (x1..x4) and of the dates (y1..y4).
x_cols <- (1:4) * 2 - 1
y_cols <- (1:4) * 2
# Stack the four x columns, and separately the four y columns, into long
# format; reshape() adds a `time` column and a `patient` id column.
long_x <- reshape(df[, x_cols], direction = "long", varying = list(1:4),
                  v.names = "x", idvar = "patient")
long_y <- reshape(df[, y_cols], direction = "long", varying = list(1:4),
                  v.names = "y", idvar = "patient")
# Pair each x with its y via the columns the two long tables share.
df <- merge(long_x, long_y)
df$patient <- factor(df$patient)
Then you could loop over the patients, perform a linear regression and get the slopes as a vector:
# Slope of y on x for every patient, returned as a numeric vector named by
# patient. vapply() guarantees exactly one number per patient. Patients with
# fewer than two complete (x, y) pairs get NA instead of being passed to
# lm(), which stops with an error when no complete pair is left.
vapply(
  levels(df$patient),
  function(pat) {
    obs <- df[df$patient == pat, c("x", "y")]
    obs <- obs[complete.cases(obs), ]
    if (nrow(obs) < 2L) {
      return(NA_real_)
    }
    # Pick the slope by coefficient name rather than by position.
    coef(lm(y ~ x, data = obs))[["x"]]
  },
  numeric(1)
)

pivoting data with rownames to be colnames r [duplicate]

This question already has answers here:
Reshaping data.frame from wide to long format
(8 answers)
Closed 5 years ago.
I have the following dataset
structure(list(Year = c("Oranges", "Cherrys", "Apples", "Bananas"
), `42461` = c(0, NA, 12, NA), `42491` = c(1, 12, NA, NA), `42522` = c(1,
12, 7, NA), `42552` = c(NA, 12, 6, NA), `42583` = c(2, NA, 8,
NA), `42614` = c(NA, 12, 5, NA), `42644` = c(NA, NA, 4, NA),
`42675` = c(NA, 12, NA, NA), `42705` = c(NA, 3, NA, NA),
`42736` = c(NA, NA, 12, NA), `42767` = c(NA, NA, 12, NA),
`42795` = c(NA, 12, NA, NA), Total = c(0, 0, 0, 0)), .Names = c("Year",
"42461", "42491", "42522", "42552", "42583", "42614", "42644",
"42675", "42705", "42736", "42767", "42795", "Total"), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -4L))
I would like to pivot it to look like:
Category-Values-Year
I tried the following:
datdat %>% gather(Cat,Var)
but the problem is that the year is the name of each column.
I removed the "Totals" column, I'm not sure if this is what you're asking for:
library (data.table)
dat = data.table (structure(list(Year = c("Oranges", "Cherrys", "Apples",
"Bananas"
), `42461` = c(0, NA, 12, NA), `42491` = c(1, 12, NA, NA), `42522` = c(1,
12, 7, NA), `42552` = c(NA, 12, 6, NA), `42583` = c(2, NA, 8,
NA), `42614` = c(NA, 12, 5, NA), `42644` = c(NA, NA, 4, NA),
`42675` = c(NA, 12, NA, NA), `42705` = c(NA, 3, NA, NA),
`42736` = c(NA, NA, 12, NA), `42767` = c(NA, NA, 12, NA),
`42795` = c(NA, 12, NA, NA), Total = c(0, 0, 0, 0)), .Names = c("Year",
"42461", "42491", "42522", "42552", "42583", "42614", "42644",
"42675", "42705", "42736", "42767", "42795", "Total"), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -4L)))
# The first column holds the fruit names, so label it accordingly.
setnames(dat, 1L, "Category")
# Drop the "Total" column by reference.
dat[, Total := NULL]
# Long format: one row per Category / Year combination.
melt.dat <- melt(dat, id.vars = "Category", variable.name = "Year")
melt.dat gives you:
> head (melt.dat)
Category Year value
1: Oranges 42461 0
2: Cherrys 42461 NA
3: Apples 42461 12
4: Bananas 42461 NA
5: Oranges 42491 1
6: Cherrys 42491 12
Also note, the table is a data.table, not a data.frame :)
Forgot to mention, run install.packages ("data.table") if you don't have it yet

Resources