Calculate percentages based on grouping variable - r

I would like to calculate the percentage of people who reported doing some work on each day. For example, I would like to know what percentage of the entire sample reported doing some work on Monday.
I used the following code to calculate this, but I am not sure about my result.
df1 <- structure(list(id = c(12L, 123L, 10L), t1_1 = c(0L, 0L, 1L),
t1_2 = c(1L, 0L, 1L), t1_3 = c(1L, 0L, 1L), t2_1 = c(0L,
1L, 1L), t2_2 = c(1L, 1L, 1L), t2_3 = c(0L, 1L, 1L), t3_1 = c(1L,
0L, 1L), t3_2 = c(0L, 0L, 1L), t3_3 = c(1L, 0L, 1L), t4_1 = c(0L,
1L, 1L), t4_2 = c(1L, 1L, 1L), t4_3 = c(0L, 1L, 1L), t5_1 = c(0L,
1L, 1L), t5_2 = c(1L, 1L, 1L), t5_3 = c(0L, 1L, 1L), t6_1 = c(1L,
0L, 1L), t6_2 = c(1L, 0L, 1L), t6_3 = c(1L, 0L, 1L), t7_1 = c(0L,
1L, 1L), t7_2 = c(0L, 1L, 1L), t7_3 = c(1L, 1L, 1L)),
class = "data.frame", row.names = c(NA, -3L))
Variable description: t1 - Monday (t1_1, t1_2, t1_3 are time steps that record whether work was done during Monday); t2 - Tuesday; t3 - Wednesday; t4 - Thursday; t5 - Friday; t6 - Saturday; and t7 - Sunday. id is an identification number.
# Reshape the wide 0/1 work indicators to long format, derive the day code,
# and plot the relative frequency of each day.
library(dplyr)
library(ggplot2)

# One row per (id, time-step column); `variable` holds names such as "t1_2".
df2 <- reshape2::melt(df1, id.vars = "id")
df2$variable <- as.character(df2$variable)
# The day code is the part of the column name before the underscore.
df2$day <- vapply(strsplit(df2$variable, "_"), `[`, character(1), 1)
# Levels follow the order of first appearance in the melted data, which is the
# column order of df1 (t1, t2, ..., t7).
df2$day <- factor(df2$day, levels = unique(df2$day))

# `percent` is each row's share of the summed `value` within its day.
# NOTE(review): this is not the share of people who worked on that day; that
# would need one "worked at all" flag per id and day first.
df3 <- df2 %>%
  group_by(day) %>%
  mutate(percent = value / sum(value) * 100) %>%
  ungroup()

ggplot(df3, aes(day, group = value)) +
  geom_bar(aes(y = after_stat(prop), fill = factor(after_stat(x))), stat = "count") +
  scale_fill_discrete(
    name = "Days",
    labels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
  ) +
  scale_y_continuous(labels = scales::percent, limits = c(0, 1)) +
  ylab("relative frequencies") +
  theme_bw()
Result:

library(dplyr)
df1 <- structure(
list(id = c(12L, 123L, 10L),
t1_1 = c(0L, 0L, 1L), t1_2 = c(1L, 0L, 1L), t1_3 = c(1L, 0L, 1L),
t2_1 = c(0L, 1L, 1L), t2_2 = c(1L, 1L, 1L), t2_3 = c(0L, 1L, 1L),
t3_1 = c(1L, 0L, 1L), t3_2 = c(0L, 0L, 1L), t3_3 = c(1L, 0L, 1L),
t4_1 = c(0L, 1L, 1L), t4_2 = c(1L, 1L, 1L), t4_3 = c(0L, 1L, 1L),
t5_1 = c(0L, 1L, 1L), t5_2 = c(1L, 1L, 1L), t5_3 = c(0L, 1L, 1L),
t6_1 = c(1L, 0L, 1L), t6_2 = c(1L, 0L, 1L), t6_3 = c(1L, 0L, 1L),
t7_1 = c(0L, 1L, 1L), t7_2 = c(0L, 1L, 1L), t7_3 = c(1L, 1L, 1L)),
class = "data.frame", row.names = c(NA, -3L))
# Long format: one row per (id, time-step column), with the 0/1 flag in `value`.
df2 <- reshape2::melt(df1, id.vars = "id")
df2$variable <- as.character(df2$variable)
# Keep the piece of the column name in front of the underscore as the day code.
df2$day <- sapply(strsplit(df2$variable, "_"), function(parts) parts[1])

# Number of time steps with work, per person and day.
by_id_day <- group_by(df2, id, day)
counts <- summarize(by_id_day, count = sum(value))

# Each day's share of that person's total count over all days.
shares <- mutate(group_by(counts, id), percent = count / sum(count))
df3 <- arrange(shares, day, id)
> df3
# A tibble: 21 x 4
# Groups: id [3]
id day count percent
<int> <chr> <int> <dbl>
1 10 t1 3 0.143
2 12 t1 2 0.182
3 123 t1 0 0
4 10 t2 3 0.143
5 12 t2 1 0.0909
6 123 t2 3 0.25
...
Is it something you are looking for?

Related

Add a new row on basis of column values in R

I am trying to get my head around this simple preprocessing task in R. I am trying to get the ideal value column as a row titled ideal in Product ID. I think the image below will shed more light on it.
> dput(df)
structure(list(Consumer = c(43L, 43L, 43L, 43L, 43L, 41L, 41L,
41L, 41L, 41L), Product = c(106L, 992L, 366L, 257L, 548L, 106L,
992L, 366L, 257L, 548L), Firm = c(1L, 1L, 1L, 1L, 1L, 0L, 0L,
0L, 0L, 0L), Juicy = c(1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 1L
), Sweet = c(0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 1L), Ideal_Firm = c(1L,
1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L), Ideal_Juicy = c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), Ideal_Sweet = c(1L, 1L, 1L,
1L, 1L, 0L, 0L, 0L, 0L, 0L)), class = "data.frame", row.names = c(NA,
-10L))
Below is a solution:
# Simulated example data: two consumers (43 and 41) who each rate the same
# five randomly drawn product ids on three binary characteristics, plus each
# consumer's "ideal" value for every characteristic.
df <- data.frame(
  Consumer = c(rep(43, 5), rep(41, 5)),
  # Five distinct product ids, repeated once per consumer.
  Product = rep(sample(100:900, size = 5, replace = FALSE), 2),
  # Random 0/1 ratings, ten per characteristic.
  Firm = sample(rep(0:1, 5), replace = TRUE),
  Juicy = sample(rep(0:1, 5), replace = TRUE),
  Sweet = sample(rep(0:1, 5), replace = TRUE),
  Ideal_Firm = 1,
  Ideal_Juicy = c(rep(1, 5), rep(2, 5)),
  Ideal_Sweet = c(rep(1, 5), rep(0, 5))
)
library(dplyr)
library(tidyr) # pivot_wider(), pivot_longer() and separate() live in tidyr

# Observed ratings, one row per consumer and one column per
# characteristic/product pair (e.g. Firm_106).
observed <- df %>%
  select(Consumer:Sweet) %>%
  pivot_wider(id_cols = Consumer, names_from = Product, values_from = Firm:Sweet)

# Ideal values, one row per consumer.
# Here I put mean, but it could be 1, median, min, max... If I understood
# correctly, it has to be 1?
ideal <- df %>%
  group_by(Consumer) %>%
  summarise(across(Ideal_Firm:Ideal_Sweet, ~ mean(.x))) %>%
  # Rename Ideal_[characteristic] to [characteristic]_Ideal so that "Ideal"
  # ends up in the product position when the names are split below.
  rename_with(~ paste0(sub("Ideal_", "", .x), "_Ideal"), .cols = -Consumer)

# Bind the observations to the ideal.
df <- merge(observed, ideal, by = "Consumer")

# Then manipulate to get into long form again: split each column name into
# characteristic and product, and spread the characteristics back out so that
# "Ideal" appears as one more product row per consumer.
df <- df %>%
  pivot_longer(cols = !Consumer) %>%
  separate(name, c("Characteristic", "Product")) %>%
  pivot_wider(
    id_cols = c(Consumer, Product),
    names_from = Characteristic,
    values_from = value
  )
df

Split data from matrix with names

I have a 42*14 matrix (5 * 14 as the example below). Are there any approaches I can split the data into 42 individual data sets (vector) and at the same time name them from subject 1 to subject 42?
#expected result (I need subject1 to subject5)
subject1 <- structure(list(`1` = 0L, `2` = 0L, `3` = 1L, `4` = 1L, `5` = 0L,
`6` = 0L, `7` = 1L, `8` = 1L, `9` = 0L, `10` = 1L, `11` = 1L,
`12` = 0L, `13` = 1L, `14` = 0L), row.names = c(NA, -1L), class = c("data.table",
"data.frame"))
structure(list(`1` = c(0L, 1L, 0L, 1L, 0L), `2` = c(0L, 1L, 1L,
0L, 1L), `3` = c(1L, 0L, 0L, 0L, 1L), `4` = c(1L, 1L, 1L, 1L,
0L), `5` = c(0L, 0L, 0L, 0L, 0L), `6` = c(0L, 0L, 1L, 1L, 1L),
`7` = c(1L, 1L, 0L, 1L, 1L), `8` = c(1L, 0L, 1L, 0L, 1L),
`9` = c(0L, 0L, 1L, 0L, 0L), `10` = c(1L, 1L, 0L, 1L, 1L),
`11` = c(1L, 1L, 1L, 0L, 0L), `12` = c(0L, 0L, 1L, 1L, 0L
), `13` = c(1L, 0L, 0L, 0L, 1L), `14` = c(0L, NA, NA, 1L,
0L)), row.names = c(NA, -5L), class = c("data.table", "data.frame"
))
You can use list2env assuming your matrix is named x.
# Split `x` along its first margin (one piece per row) with asplit(), name the
# pieces "subject1", "subject2", ... and create one object per name in the
# global environment.
list2env(setNames(asplit(x, 1),
paste0("subject", seq_len(nrow(x)))), globalenv())
ls()
#[1] "subject1" "subject2" "subject3" "subject4" "subject5" "x"
subject1
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14
# 0 0 1 1 0 0 1 1 0 1 1 0 1 0
In case you want to remove also NA:
# Split `x` by row, keep only the non-NA entries of each row
# (Filter with the predicate Negate(is.na)), name the pieces "subject1",
# "subject2", ... and create one object per name in the global environment.
list2env(setNames(lapply(asplit(x, 1), Filter, f = Negate(is.na)),
paste0("subject", seq_len(nrow(x)))), globalenv())
subject2
# 1 2 3 4 5 6 7 8 9 10 11 12 13
# 1 1 0 1 0 0 1 0 0 1 1 0 0
You want each row as one element in the list, right?
Might be prettier solutions out there, but this would do the trick:
# Build a list with one element per row of `your.data`, dropping the entries
# that are NA in that row, and name the elements "subject1", "subject2", ...
# seq_len()/seq_along() keep this correct when there are no rows
# (1:0 would iterate over c(1, 0)).
# NOTE(review): if `your.data` is a data.table, `[i, keep]` is not evaluated as
# column selection; convert with as.data.frame() or as.matrix() first - verify.
l <- lapply(
  seq_len(nrow(your.data)),
  function(i) {
    keep <- !is.na(your.data[i, ])
    your.data[i, keep]
  }
)
names(l) <- paste0("subject", seq_along(l))

How to make loop for one-at-a time logistic regression in R?

I want to do logistic regressions for several (n = 30) SNPs (coded as 0, 1, 2) as predictors and a case-control variable (0, 1) as the outcome. As a few of those SNPs are correlated, I cannot put all rs# in one model but have to run a separate regression for each one, i.e., I cannot simply add them together in one model like rs1 + rs2 + rs3 and so on. I need to have each regressed separately, like below:
test1 = glm(casecontrol ~ rs1, data = mydata, family=binomial)
test2 = glm(casecontrol ~ rs2, data = mydata, family=binomial)
test3 = glm(casecontrol ~ rs3, data = mydata, family=binomial)
While I can run all the above regressions separately, is there a way to loop them together so I could get a summary() of all tests in one go?
I will have to adjust for age and sex too, but that would come after I run an unadjusted loop.
My data head from dput(head(mydata)) for example;
structure(list(ID = 1:6, sex = c(2L, 1L, 2L, 1L, 1L, 1L), age = c(52.4725405022036,
58.4303618001286, 44.5300065923948, 61.4786037395243, 67.851808819687,
39.7451378498226), bmi = c(31.4068751083687, 32.0614937413484,
23.205021363683, 29.1445372393355, 32.6287483051419, 20.5887741968036
), casecontrol = c(0L, 1L, 0L, 1L, 1L, 1L), rs1 = c(1L, 0L, 2L,
2L, 1L, 2L), rs2 = c(2L, 1L, 2L, 0L, 1L, 1L), rs3 = c(1L, 0L,
1L, 2L, 2L, 2L), rs4 = c(1L, 1L, 1L, 1L, 0L, 2L), rs5 = c(1L,
0L, 0L, 0L, 1L, 2L), rs6 = c(1L, 1L, 1L, 1L, 1L, 2L), rs7 = c(0L,
0L, 0L, 0L, 0L, 0L), rs8 = c(0L, 0L, 1L, 0L, 2L, 1L), rs9 = c(0L,
0L, 2L, 1L, 1L, 0L), rs10 = c(2L, 0L, 0L, 2L, 2L, 1L), rs11 = c(0L,
1L, 1L, 0L, 1L, 1L), rs12 = c(1L, 2L, 0L, 1L, 2L, 2L), rs13 = c(0L,
2L, 0L, 0L, 0L, 0L), rs14 = c(1L, 1L, 1L, 1L, 2L, 2L), rs15 = c(1L,
2L, 1L, 1L, 0L, 1L), rs16 = c(0L, 2L, 1L, 2L, 2L, 1L), rs17 = c(0L,
2L, 1L, 1L, 2L, 2L), rs18 = c(1L, 2L, 2L, 1L, 1L, 1L), rs19 = c(1L,
1L, 0L, 1L, 2L, 2L), rs20 = c(2L, 1L, 0L, 2L, 2L, 1L), rs21 = c(1L,
2L, 2L, 1L, 1L, 0L), rs22 = c(1L, 1L, 2L, 2L, 0L, 1L), rs23 = c(2L,
0L, 2L, 1L, 1L, 1L), rs24 = c(0L, 0L, 0L, 2L, 2L, 2L), rs25 = c(2L,
2L, 1L, 1L, 0L, 1L), rs26 = c(1L, 1L, 0L, 2L, 0L, 1L), rs27 = c(1L,
1L, 1L, 1L, 0L, 1L), rs28 = c(0L, 1L, 1L, 2L, 0L, 2L), rs29 = c(2L,
2L, 2L, 2L, 1L, 2L), rs30 = c(0L, 2L, 1L, 2L, 1L, 0L)), row.names = c(NA,
6L), class = "data.frame")```
Probably you want something like this:
lapply(1:30, function(i) glm(as.formula(paste0('casecontrol ~ ', 'rs', i)), data = mydata, family = binomial))
which will execute 30 logistic regressions with the selected predictor.
Instead of hard coding the overall number of predictors, you can use:
sum(grepl('rs', names(mydata))), which will return 30.
You can use tidy function from broom package to get the summary in a tidy format.
purrr::map_dfr(1:30, function(i) data.frame(model = i, tidy(glm(as.formula(paste0('casecontrol ~ ', 'rs', i)), data = mydata, family = binomial))))
or you can do this in a more dynamic way:
# Collect all predictor columns: names made of "rs" followed by digits.
# The pattern is anchored so that other columns that merely contain "rs"
# are not picked up.
pred <- grep("^rs[0-9]+$", names(mydata), value = TRUE)

# Fit one logistic regression per entry of `pred` and stack the tidy
# coefficient tables; `model` is the position of the predictor in `pred`.
# seq_along() makes this a no-op when `pred` is empty.
purrr::map_dfr(seq_along(pred), function(i) {
  fit <- glm(
    as.formula(paste0("casecontrol ~ ", pred[i])),
    data = mydata,
    family = binomial
  )
  data.frame(model = i, broom::tidy(fit))
})
If you want to include another variable, you simply need to adjust the pred vector.
# Append, for every predictor, a second right-hand side that also contains age.
c(pred, paste0(pred, ' + age')) -> pred #age enters as an additive covariate (`+`), not as an interaction
or
# Append, for every predictor, a second right-hand side that also contains age and sex.
c(pred, paste0(pred, ' + age + sex')) -> pred #age and sex enter as additive covariates (`+`), not as interactions
you can do something like this
# Fit one logistic regression per SNP column and print each summary.
outcome <- mydata["casecontrol"] # outcome, kept as a one-column data frame
# All predictor columns: names made of "rs" followed by digits.
features <- mydata[grep("^rs[0-9]+$", names(mydata))]

# Keep every fitted model, named by its predictor, so results can be reused.
fits <- vector("list", ncol(features))
names(fits) <- names(features)

for (i in seq_along(features)) {
  # Two-column data frame: the current predictor plus the outcome.
  total <- cbind(features[i], outcome)
  fits[[i]] <- glm(casecontrol ~ ., data = total, family = binomial("logit"))
  # Inside a for loop values are not auto-printed, so print explicitly.
  print(summary(fits[[i]]))
}

Turning a Presence/Absence Matrix into a Cluster Analysis in R Studio

Okay, I have a presence/absence matrix of 6 samples with 25 possibilities of presence/absence.
I've been able to make a cluster dendrogram with the data, but I'd rather have it plotted as a distance matrix that looks better and is easier to analyse. (Maybe a cluster plot or something similar?)
I'm really stuck with figuring out the next part - I've spent days searching on here and various other Google searches but nothing is turning up!
Here's the code I've got for the cluster dendrogram:
matrix<-read.csv("Horizontal.csv")
distance<-dist(matrix)
hc.m<-hclust(distance)
plot(hc.m, labels=matrix$Sample, main ="", cex.main=0.8, cex.lab= 1.1)
Help!
> dput(head(matrix,20))structure(list(Sample = structure(1:6, .Label = c("CL1", "CL2",
"CL3", "COL1", "COL2", "COL3"), class = "factor"), X = c(0L,
0L, 0L, 1L, 1L, 1L), X.1 = c(1L, 0L, 0L, 1L, 1L, 1L), X.2 = c(1L,
1L, 1L, 0L, 0L, 0L), X.3 = c(1L, 1L, 1L, 1L, 1L, 1L), X.4 = c(1L,
1L, 1L, 0L, 0L, 0L), X.5 = c(0L, 0L, 0L, 1L, 1L, 0L), X.6 = c(1L,
1L, 1L, 1L, 1L, 1L), X.7 = c(1L, 1L, 1L, 1L, 1L, 1L), X.8 = c(0L,
0L, 0L, 1L, 1L, 1L), X.9 = c(0L, 0L, 0L, 1L, 1L, 1L), X.10 = c(1L,
1L, 1L, 1L, 1L, 1L), X.11 = c(1L, 1L, 1L, 1L, 1L, 1L), X.12 = c(1L,
1L, 1L, 1L, 1L, 1L), X.13 = c(1L, 0L, 0L, 0L, 0L, 0L), X.14 = c(0L,
0L, 0L, 1L, 1L, 1L), X.15 = c(0L, 0L, 0L, 1L, 1L, 1L), X.16 = c(1L,
1L, 1L, 1L, 0L, 0L), X.17 = c(1L, 1L, 1L, 1L, 1L, 1L), X.18 = c(1L,
1L, 1L, 1L, 1L, 1L), X.19 = c(1L, 1L, 1L, 1L, 1L, 1L), X.20 = c(1L,
1L, 1L, 1L, 1L, 1L), X.21 = c(1L, 1L, 1L, 1L, 0L, 0L), X.22 = c(0L,
0L, 0L, 0L, 1L, 1L), X.23 = c(1L, 1L, 1L, 1L, 1L, 1L), X.24 = c(0L,
1L, 1L, 1L, 1L, 1L)), .Names = c("Sample", "X", "X.1", "X.2",
"X.3", "X.4", "X.5", "X.6", "X.7", "X.8", "X.9", "X.10", "X.11",
"X.12", "X.13", "X.14", "X.15", "X.16", "X.17", "X.18", "X.19",
"X.20", "X.21", "X.22", "X.23", "X.24"), row.names = c(NA, 6L
), class = "data.frame")
Okay with this code:
library(vegan)
library(ggplot2)
library(tidyverse)
library(MASS)
#set working directory
setwd("~/Documents/Masters/BS707/Metagenomics")
#read csv file
cookie<-read.csv("Horizontal.csv")
data.frame(cookie, row.names = c("CL1", "CL2", "CL3", "COL1", "COL2", "COL3"))
df = subset(cookie)
data.frame(df, row.names = c("CL1", "CL2", "CL3", "COL1", "COL2", "COL3"))
dm<- dist(df, method = "binary") #calculate the distance matrix
cmdscale(dm, eig = TRUE, k=2) -> mds
as.tibble(mds$points) #mds coordinates
bind_cols(df, Sample = df$Sample) #bind sample names
mutate(df,group = gsub("\\d$", "", "Sample1"))#remove last digit from sample names to form groups
ggplot(df)+
geom_point (aes(x = "V1",y = "V2", color = "group")) #plot
as.tibble(mds$points) %>% ggplot() + geom_point (aes(x = V1, y = V2))
I get the plot but each group is named 'Sample' rather than CL1, CL2, CL3, COL1, COL2, COL3. I had to remove the %>% because my R didn't recognise it as a command or anything and gave an error every single time (switched to + or deleted and then it worked fine).
Here is a way to visualize your data in 2 dimensions:
library(tidyverse)

# Classical MDS (also known as principal coordinates analysis) on the binary
# distance between samples.
mds <- df %>%
  dplyr::select(-1) %>%           # remove first column (the sample names)
  dist(method = "binary") %>%     # calculate the distance matrix
  cmdscale(eig = TRUE, k = 2)     # two MDS dimensions

# mds$points is a matrix without column names; as.data.frame() labels its
# columns V1 and V2, which the plot below relies on.
as.data.frame(mds$points) %>%                  # mds coordinates
  bind_cols(Sample = df$Sample) %>%            # bind sample names
  mutate(group = gsub("\\d$", "", Sample)) %>% # remove last digit from sample names to form groups
  ggplot() +
  geom_point(aes(x = V1, y = V2, color = group)) # plot
or without tidyverse:
df_dist <- dist(df[,-1], method = "binary")
mds <- cmdscale(df_dist, eig = TRUE, k = 2)
for_plot <- data.frame(mds$points, group = gsub("\\d$", "", df$Sample))
ggplot(for_plot)+
geom_point(aes(x = X1,y = X2, color = group))
other options include using isoMDS from MASS library which will perform Kruskal's Non-metric Multidimensional Scaling or metaMDS from vegan library which performs Nonmetric Multidimensional Scaling with Stable Solution from Random Starts, Axis Scaling and Species Scores.

impute missing values using minimum of a class in R

I'm new to R and need help with imputing missing values in one of the columns in a dataset that I'm currently working on. The below image shows the missing value I want to impute along with few of the columns.
I want to fill in the value with the minimum qty for a customer using its previous entries as I think this best fits my situation and data. For example, in the image I should be able to fill in the missing value with 1 (min of 1,5,2).
During my search, I mainly came across methods that use mean for a given class, and not minimum or maximum.
Any help or pointers would really be appreciated.
Edit: Here is the output from dput.
structure(list(YEAR = c(2011L, 2012L, 2014L, 2015L, 2011L, 2012L
), CustomerId = c("00000063", "00000063", "00000063", "00000063",
"00000065", "00000065"), MemberType = structure(c(2L, 2L, 2L,
2L, 2L, 2L), .Label = c("GROUP", "INDIVIDUAL", "PARTNER"), class = "factor"),
MembershipTypeCode = structure(c(6L, 6L, 6L, 10L, 6L, 6L), .Label = c("EGROUP",
"EINDIV", "EINDIV2", "EPARTNER", "GROUP", "INDIV", "INDIV2",
"INDIV3", "PARTNER", "PLUS", "PLUS2", "PLUS20", "PLUS3",
"PLUSENTERPRI", "PLUSGROUP", "PLUSGROUP2", "PROF_ENTERPR",
"PROF_GROUP", "PROF_GROUP2", "PROF_INDIV", "PROF_INDIV2",
"PROF_INDIV3"), class = "factor"), MembershipPeriodBegin = structure(c(15279,
15677, 16071, 16436, 15006, 15371), class = "Date"), MembershipPeriodEnd = structure(c(15644,
16070, 16435, 16800, 15370, 15736), class = "Date"), ConsecutiveYearsAsMember = c(14L,
15L, 17L, 18L, 8L, 9L), AllocationUsage = c(0, 0, 0, 0, 0,
0), SetCOPPreference = structure(c(2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("Y", "N"), class = "factor"), Purchase.Qty = c(2L,
5L, 1L, NA, 7L, 27L), Webcast.Registration = c(0L, 0L, 0L,
0L, 0L, 1L), Web.Visits = c(0L, 0L, 42L, 0L, 0L, 0L), Web.Page.Views = c(0L,
0L, 98L, 0L, 0L, 0L), Blog.Visits = c(0L, 0L, 3L, 0L, 0L,
0L), Blog.Page.Views = c(0L, 0L, 4L, 0L, 0L, 0L), Forum.Visits = c(0L,
0L, 45L, 0L, 0L, 0L), Forum.Page.Views = c(0L, 0L, 102L,
0L, 0L, 0L), ParatureTickets = c(0L, 0L, 0L, 0L, 0L, 0L),
ParatureChats = c(0L, 0L, 0L, 0L, 0L, 0L), Registered.for.Edu = c(0L,
0L, 0L, 0L, 0L, 0L), Attended.ICE = structure(c(2L, 2L, 2L,
2L, 2L, 2L), .Label = c("Y", "N"), class = "factor"), Attended.TK = structure(c(2L,
2L, 2L, 2L, 2L, 2L), .Label = c("Y", "N"), class = "factor"),
Frugal = structure(c(2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Y",
"N"), class = "factor"), Chapter.Board = structure(c(2L,
2L, 2L, 2L, 2L, 2L), .Label = c("Y", "N"), class = "factor"),
Retained = structure(c(5L, 5L, 5L, 1L, 5L, 5L), .Label = c("Active",
"Awaiting Renewal", "Future Dated", "Lost", "Retained"), class = "factor"),
ProfileCompletion = c(60, 60, 60, 60, 60, 60), NumberofLogins = c(1L,
1L, 15L, 0L, 0L, 4L), Downloads = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_), ForumMember = structure(c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_
), .Label = "N", class = "factor"), FreeUpgrade = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("Y", "N"), class = "factor")), .Names = c("YEAR",
"CustomerId", "MemberType", "MembershipTypeCode", "MembershipPeriodBegin",
"MembershipPeriodEnd", "ConsecutiveYearsAsMember", "AllocationUsage",
"SetCOPPreference", "Purchase.Qty", "Webcast.Registration", "Web.Visits",
"Web.Page.Views", "Blog.Visits", "Blog.Page.Views", "Forum.Visits",
"Forum.Page.Views", "ParatureTickets", "ParatureChats", "Registered.for.Edu",
"Attended.ICE", "Attended.TK", "Frugal", "Chapter.Board", "Retained",
"ProfileCompletion", "NumberofLogins", "Downloads", "ForumMember",
"FreeUpgrade"), row.names = c(NA, 6L), class = "data.frame")
Thanks,
Pratik
We can use na.aggregate with FUN= min. We convert the 'data.frame' to 'data.table' (setDT(df1)), grouped by 'CustomerID', we apply na.aggregate on 'PurchaseQty' and assign (:=) the output back to the 'PurchaseQty'.
library(data.table)
library(zoo)
# Convert df1 to a data.table by reference (setDT), then, within each
# CustomerID group, overwrite PurchaseQty in place (:=) with the result of
# na.aggregate(), which here fills missing values using FUN = min.
setDT(df1)[, PurchaseQty := na.aggregate(PurchaseQty, FUN= min) , by = CustomerID]
data
df1 <- data.frame(CustomerID= rep(1:2, each=4), PurchaseQty= c(4, 3, NA, 3, 1, 9, NA, 4))
Since you provide no data, here is a toy example of how I would do it in base R:
# Toy data: three groups in `a` (10, 11, 12) with four rows each,
# and `b` counting down from 12 to 1.
data <- data.frame(a = rep(10:12, each = 4), b = 12:1)
# Knock out three entries of b (rows 3, 5 and 12).
data$b[c(3, 5, 12)] <- NA
# Within each group of `a`, replace the missing entries of `b` by the
# smallest non-missing `b` of that group.
data$b <- ave(
  data$b, data$a,
  FUN = function(v) {
    v[is.na(v)] <- min(v, na.rm = TRUE)
    v
  }
)
data
a b
1 10 12
2 10 11
3 10 9
4 10 9
5 11 5
6 11 7
7 11 6
8 11 5
9 12 4
10 12 3
11 12 2
12 12 2

Resources