Extracting time information from a raw dataset in R - r

I would like to extract ID, item information and time information from an unstructured dataset. Here is what my sample dataset looks like:
# Reproducible sample of the raw, unstructured sheet (28 rows).
# Text_1 holds the row labels (e.g. "Item 1: Time in seconds").
# The remaining columns come in pairs: a School.* column with field labels,
# scores and times, followed by a column with the header values (teacher,
# date, student ID, ...) and the "Child response" cells.
df <- data.frame(Text_1 = c("Scoring", "1 = Incorrect","Text1","Text2","Text3","Text4", "Demo 1: Color Naming","Amarillo","Azul","Verde","Azul",
"Demo 1: Errors","Item 1: Color naming","Amarillo","Azul","Verde","Azul",
"Item 1: Time in seconds","Item 1: Errors",
"Item 2: Shape Naming","Cuadrado/Cuadro","Cuadrado/Cuadro","Círculo","Estrella","Círculo","Triángulo",
"Item 2: Time in seconds","Item 2: Errors"),
School.2 = c("Teacher:","DC Name:","Date (mm/dd/yyyy):","Child Grade:","Student Study ID:",NA, NA,NA,NA,NA,NA,
0,"1 = Incorrect responses",0,1,NA,NA,NA,0,"1 = Incorrect responses",0,NA,NA,1,1,0,NA,0),
X_Elementary_School..3 = c("Bill:","X District","10/7/21","K","123-2222-2:",NA, NA,NA,NA,NA,NA,
NA,"Child response",NA,NA,NA,NA,NA,NA,"Child response",NA,NA,NA,NA,NA,NA,NA,NA),
School.4 = c("Teacher:","DC Name:","Date (mm/dd/yyyy):","Child Grade:","Student Study ID:",NA, 0,NA,1,NA,NA,0,"1 = Incorrect responses",0,1,NA,NA,120,0,"1 = Incorrect responses",NA,1,0,1,NA,1,110,0),
Y_Elementary_School..2 = c("John:","X District","11/7/21","K","112-1111-3:",NA, NA,NA,NA,NA,NA,NA,"Child response",NA,NA,NA,NA,NA,NA,"Child response",NA,NA,NA,NA,NA,NA, NA,NA))
> df
Text_1 School.2 X_Elementary_School..3 School.4 Y_Elementary_School..2
1 Scoring Teacher: Bill: Teacher: John:
2 1 = Incorrect DC Name: X District DC Name: X District
3 Text1 Date (mm/dd/yyyy): 10/7/21 Date (mm/dd/yyyy): 11/7/21
4 Text2 Child Grade: K Child Grade: K
5 Text3 Student Study ID: 123-2222-2: Student Study ID: 112-1111-3:
6 Text4 <NA> <NA> <NA> <NA>
7 Demo 1: Color Naming <NA> <NA> 0 <NA>
8 Amarillo <NA> <NA> <NA> <NA>
9 Azul <NA> <NA> 1 <NA>
10 Verde <NA> <NA> <NA> <NA>
11 Azul <NA> <NA> <NA> <NA>
12 Demo 1: Errors 0 <NA> 0 <NA>
13 Item 1: Color naming 1 = Incorrect responses Child response 1 = Incorrect responses Child response
14 Amarillo 0 <NA> 0 <NA>
15 Azul 1 <NA> 1 <NA>
16 Verde <NA> <NA> <NA> <NA>
17 Azul <NA> <NA> <NA> <NA>
18 Item 1: Time in seconds <NA> <NA> 120 <NA>
19 Item 1: Errors 0 <NA> 0 <NA>
20 Item 2: Shape Naming 1 = Incorrect responses Child response 1 = Incorrect responses Child response
21 Cuadrado/Cuadro 0 <NA> <NA> <NA>
22 Cuadrado/Cuadro <NA> <NA> 1 <NA>
23 Círculo <NA> <NA> 0 <NA>
24 Estrella 1 <NA> 1 <NA>
25 Círculo 1 <NA> <NA> <NA>
26 Triángulo 0 <NA> 1 <NA>
27 Item 2: Time in seconds <NA> <NA> 110 <NA>
28 Item 2: Errors 0 <NA> 0 <NA>
Here, when the first column contains "Item #: Time in seconds", I would like to keep the corresponding values under the School.2 and School.4 columns. The first student has empty cells for the Item 1 and Item 2 time information, so those values are NA. The second student has time values of 120 and 110. Both students have two items in the example dataset.
The real dataset has many more columns, so I need a generalized solution (for example, a loop).
My desired output would be:
> time
id itemid time
1 123-2222-2 Item 1 NA
2 123-2222-2 Item 2 NA
3 112-1111-3 Item 1 120
4 112-1111-3 Item 2 110
This is my attempt but I could not add the id yet.
# Keep only the "Item N: Time in seconds" rows.
time.data <- df %>%
  filter(str_detect(Text_1, "Time in seconds"))

# Every second column, starting at column 2, holds a student's recorded values.
select_time_cols <- seq(from = 2, to = ncol(time.data), by = 2)

# all_of() states explicitly that select_time_cols is an external vector of
# column positions, not a column of time.data.
time <- time.data %>%
  select(time = all_of(select_time_cols))

# Transpose so that each row is a student and each column is an item.
# stringsAsFactors = FALSE keeps the cells as text, so as.numeric() below
# converts the printed values and not factor codes.
time.t <- as.data.frame(t(time), stringsAsFactors = FALSE)
rownames(time.t) <- seq_len(nrow(time.t))
# Name the columns item1, item2, ... to match the result shown below.
colnames(time.t) <- paste0("item", seq_len(ncol(time.t)))

# Convert column by column. apply(time.t, 2, as.numeric) would collapse the
# result to a plain vector when there is only one student.
time.t[] <- lapply(time.t, as.numeric)
> time.t
item1 item2
1 NA NA
2 120 110

I came up with a solution with your test dataset, though this isn't particularly robust or elegant, so there will likely be better options out there. The key thing I did was shift the text3 row over, so that the id and time were in the same column. I put notes in the code to illuminate my steps. Hopefully this points you in the right direction or prompts those with better coding skills than my own!
library(dplyr)
library(tidyr)

# Turn everything into character so the ID row can be shifted without
# running into factor levels or mixed column types.
df2 <- df %>%
  mutate(across(everything(), as.character))

# Locate the row holding the student IDs by its label rather than by a
# hard-coded row number.
id_row <- which(df2[[2]] == "Student Study ID:")
stopifnot(length(id_row) == 1L)

# Shift that row one column to the left, so that each ID sits in the same
# column as that student's times.
df2[id_row, 2:(ncol(df2) - 1)] <- df2[id_row, 3:ncol(df2)]

# The School.* columns now carry the IDs; drop the trailing ":" from them.
ids <- sub(
  ":$", "",
  unlist(df2[id_row, startsWith(names(df2), "School")], use.names = FALSE)
)

df3 <- df2 %>%
  # removing unnecessary rows
  filter(grepl("Time in seconds", Text_1)) %>%
  # "Item 1: Time in seconds" -> "Item 1", for any number of items
  mutate(itemid = sub(":.*$", "", Text_1)) %>%
  # only keeping needed columns
  select(itemid, starts_with("School"))

# Use the student IDs as the names of the value columns.
names(df3)[-1] <- ids

df3 %>%
  # Making the data into longer format
  pivot_longer(-itemid, names_to = "id", values_to = "time") %>%
  # times were stored as text; empty cells stay NA
  mutate(time = as.numeric(time)) %>%
  # relocating columns to match desired output
  select(id, itemid, time) %>%
  # sorting to match desired output
  arrange(desc(id))

Related

Converting a list of tab- and newline-delimited character vectors into a dataframe in R

I have an object res which is a list of tab- and newline-delimited character vectors.
>dput(res)
list("# BLASTP 2.11.0+\n# Query: tr|A4I9M8|A4I9M8_LEIIN
Hypothetical_protein_-_conserved OS=Leishmania infantum OX=5671
GN=LINF_340011400 PE=4 SV=1\n# RID: HT7Z4V6H016\n# Database: nr\n#
Fields: subject id, evalue, % query coverage per subject, % identity\n#
35 hits found\nref|XP_001468447.1|\t8.15e-
82\t100\t100.000\nref|XP_001686149.1|\t1.33e-
73\t100\t90.984\ndbj|GET92147.1|\t4.31e-
67\t99\t82.645\ngb|KAG5468145.1|\t2.87e-
66\t100\t82.787\nref|XP_003878644.1|\t1.96e-
64\t100\t88.525\n# BLASTP 2.11.0+\n# Query: tr|A4HT95|A4HT95_LEIIN
Centrin_-_putative OS=Leishmania infantum OX=5671 GN=LINF_070012700
PE=3 SV=1\n# RID: HT7Z4V6H016\n# Database: nr\n# Fields: subject id,
evalue, % query coverage per subject, % identity\n# 512 hits
found\nref|XP_001463286.1|\t1.53e-
132\t100\t100.000\nref|XP_001680978.1|\t9.57e-
125\t100\t95.187\nref|XP_003872218.1|\t2.29e-
121\t100\t93.048\ndbj|GET86075.1|\t1.03e-
114\t97\t89.071\ngb|KAG5510937.1|\t1.50e-
103\t100\t80.214\ngb|KAG5486865.1|\t2.99e-103\t100\t84.492\n# BLAST
processed 2 queries\n",
"")
This is the structure of res
>str(res)
List of 2
$ : chr "# BLASTP 2.11.0+\n# Query: tr|A4I9M8|A4I9M8_LEIIN Hypothetical_protein_-_conserved
OS=Leishmania infantum OX=56"| __truncated__
$ : chr ""
How do I convert res into a dataframe in R? The dataframe should look like this:
This is horrible, but...
# Re-join exponents that were split by artificial line wraps in scientific
# notation, e.g. "8.15e- 82" or "8.15e-\n82" -> "8.15e-82". The look-arounds
# restrict the fix to whitespace sitting inside a number.
res_clean <- gsub("(?<=[0-9]e-)\\s+(?=[0-9])", "", res[[1]], perl = TRUE)
# split on newline -- each element is a row
res_split <- strsplit(res_clean, split = "\n", fixed = TRUE)[[1]]
# split every row into its tab-delimited fields
row_fields <- strsplit(res_split, split = "\t", fixed = TRUE)
# the widest row determines the number of variables
num_vars <- max(lengths(row_fields))
# pad short rows with NA, then bind all rows at once instead of growing
# the data frame one row at a time
padded <- lapply(row_fields, function(row) {
  c(row, rep(NA_character_, num_vars - length(row)))
})
dat <- as.data.frame(do.call(rbind, padded), stringsAsFactors = FALSE)
names(dat) <- paste0("v", seq_len(num_vars))
head(dat, 20)
# v1 v2 v3 v4
#1 # BLASTP 2.11.0+ <NA> <NA> <NA>
#2 # Query: tr|A4I9M8|A4I9M8_LEIIN <NA> <NA> <NA>
#3 Hypothetical_protein_-_conserved OS=Leishmania infantum OX=5671 <NA> <NA> <NA>
#4 GN=LINF_340011400 PE=4 SV=1 <NA> <NA> <NA>
#5 # RID: HT7Z4V6H016 <NA> <NA> <NA>
#6 # Database: nr <NA> <NA> <NA>
#7 # <NA> <NA> <NA>
#8 Fields: subject id, evalue, % query coverage per subject, % identity <NA> <NA> <NA>
#9 # <NA> <NA> <NA>
#10 35 hits found <NA> <NA> <NA>
#11 ref|XP_001468447.1| 8.15e-82 100 100.000
#12 ref|XP_001686149.1| 1.33e-73 100 90.984
#13 dbj|GET92147.1| 4.31e-67 99 82.645
#14 gb|KAG5468145.1| 2.87e-66 100 82.787
#15 ref|XP_003878644.1| 1.96e-64 100 88.525
#16 # BLASTP 2.11.0+ <NA> <NA> <NA>
#17 # Query: tr|A4HT95|A4HT95_LEIIN <NA> <NA> <NA>
#18 Centrin_-_putative OS=Leishmania infantum OX=5671 GN=LINF_070012700 <NA> <NA> <NA>
#19 PE=3 SV=1 <NA> <NA> <NA>
#20 # RID: HT7Z4V6H016 <NA> <NA> <NA>

converting as factor for a list of data frames

I am trying to create a custom function to give labels to modified list of data frames. For example, I have a data frame like below.
# Example survey data. gender and country are stored as numeric codes;
# Q1-Q4 contain only 1 or NA; Q5 contains 1, 2 or NA.
df<-data.frame(
gender = c(1,2,1,2,1,2,1,2,2,2,2,1,1,2,2,2,2,1,1,1,1,1,2,1,2,1,2,2,2,1,2,1,2,1,2,1,2,2,2),
country = c(3,3,1,2,5,4,4,4,4,3,3,4,3,4,2,1,4,2,3,4,4,4,3,1,2,1,5,5,4,3,1,4,5,2,3,4,5,1,4),
Q1=c(1,1,NA,NA,NA,NA,NA,NA,1,NA,NA,NA,NA,NA,NA,NA,NA,1,NA,NA,NA,1,1,1,NA,1,1,NA,NA,NA,NA,1,NA,NA,NA,NA,1,NA,1),
Q2=c(1,1,1,1,1,NA,NA,NA,NA,1,1,1,1,1,NA,NA,NA,1,1,1,NA,1,1,1,1,1,NA,NA,NA,1,1,1,1,1,1,1,NA,NA,NA),
Q3=c(1,1,NA,NA,NA,NA,NA,1,NA,NA,NA,NA,NA,NA,NA,NA,1,NA,NA,NA,NA,NA,1,1,1,NA,NA,NA,1,NA,NA,1,1,1,1,1,NA,NA,1),
Q4=c(1,NA,NA,NA,NA,NA,NA,NA,NA,NA,1,NA,NA,NA,NA,NA,NA,NA,1,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA),
Q5=c(1,2,1,1,1,2,1,2,2,1,2,NA,1,1,2,2,2,1,1,1,2,NA,2,1,1,1,2,2,2,NA,1,2,2,1,1,1,2,2,2)
)
I understand your goal to be the following: You want to take a list of data frames (ldat). For each of the dataframes in the list (df, df2) you want to take some existing columns (Q1, Q2, Q3) and replicate them with new names in the same data frame (Q1_new, Q2_new, Q3_new). This you could achieve like this:
# Columns to copy, and the names the copies should receive (same order).
variables <- c("Q1", "Q2", "Q3")
new_label <- c("Q1_new", "Q2_new", "Q3_new")

# For every data frame in the list, duplicate the source columns under the
# new names and hand back the extended data frame.
newdfs <- lapply(ldat, function(dat) {
  dat[, new_label] <- dat[, variables]
  dat
})
head(newdfs$ALL)
gender country Q1 Q2 Q3 Q4 Q5 cc2 Q1_new Q2_new Q3_new
1 Male USA Yes Available Partner Depends on sales Local 1 Yes Available Partner
2 female USA Yes Available Partner <NA> Overseas NA Yes Available Partner
3 Male CAN <NA> Available <NA> <NA> Local 1 <NA> Available <NA>
4 female EU <NA> Available <NA> <NA> Local 1 <NA> Available <NA>
5 Male UK <NA> Available <NA> <NA> Local 1 <NA> Available <NA>
6 female BR <NA> <NA> <NA> <NA> Overseas NA <NA> <NA> <NA>
Is this what you had in mind?

Merging data with data.table roll="nearest" rolls matches across the entire DF

I have two tables, a sample table and a message table. In the message table, messages are recorded outside the sampling rate of the tracker. What I have been doing is using data.table roll nearest to match the sample message time to the closest value in the sample report. Instead of returning the sample message to the nearest time and NAs for everything else, it seems that it is rolling messages until the next message.
library(data.table)
library(dplyr) # supplies %>%, group_by() and slice_min() used below

# Install gazer (the source of the example data) only when it is missing.
if (!requireNamespace("gazer", quietly = TRUE)) {
  remotes::install_github("dmirman/gazer")
}
library(gazer)

# fread() already returns data.tables, so no setDT() call is needed afterwards.
samp_path <- system.file("extdata", "TJ_samp.xls", package = "gazer")
samp <- data.table::fread(samp_path, stringsAsFactors = FALSE) # reads in large datasets
msg_path <- system.file("extdata", "TJ_msg.xls", package = "gazer")
msg <- data.table::fread(msg_path, stringsAsFactors = FALSE) # reads in large datasets

# For every sample row, pull in the message whose time stamp is nearest;
# use this to get close to values in the sample report.
DT_mesg <- msg[samp, on = "time", roll = "nearest"]
DT_mesg

# SR edfs are a nightmare. This makes it so messages are aligned with the
# closest values.
# There are a lot of useless messages and they occupy the same time stamp.
# Only take the first message in time. This was one way I tried to deal with
# the multiple message issue, but it does not return messages close to their
# actual time.
get_msg <- DT_mesg %>%
  group_by(trial, message) %>%
  slice_min(time, n = 1) %>%
  ungroup()
get_msg
I was able to figure it out...
# Same join as in the question, but with a finite roll window instead of
# roll="nearest".
# NOTE(review): per the data.table documentation a positive `roll` carries a
# value forward by at most that many units of the join column. Here that is 4,
# which matches the spacing of the sample time stamps in the output below;
# confirm for other recordings.
setDT(samp)
setDT(msg)
DT_mesg <- msg[samp, on="time", roll=4]
This gives me my desired result:
trial time message i.trial x y pup Label
1: 1 3314705 !MODE RECORD CR 250 2 1 L\n 1 958.8 580.8 4043 <NA>
2: NA 3314709 <NA> 1 959.1 576.2 4052 <NA>
3: NA 3314713 <NA> 1 959.8 575.6 4053 <NA>
4: NA 3314717 <NA> 1 960.6 575.2 4056 <NA>
5: NA 3314721 <NA> 1 960.2 579.6 4049 <NA>
Not sure why roll="nearest" returns this:
trial time message i.trial x y pup Label
1: 1 3314705 !MODE RECORD CR 250 2 1 L\n 1 958.8 580.8 4043 <NA>
2: 1 3314709 !MODE RECORD CR 250 2 1 L\n 1 959.1 576.2 4052 <NA>
3: 1 3314713 !MODE RECORD CR 250 2 1 L\n 1 959.8 575.6 4053 <NA>
4: 1 3314717 !MODE RECORD CR 250 2 1 L\n 1 960.6 575.2 4056 <NA>
5: 1 3314721 !MODE RECORD CR 250 2 1 L\n 1 960.2 579.6 4049 <NA>

How do I infill non-adjacent rows with sample data from previous rows in R?

I have data containing a unique identifier, a category, and a description.
Below is a toy dataset.
# Project identifiers 1..10, stored as doubles.
prjnumber <- as.numeric(seq_len(10))
# Category and description are known for projects 1-4 and 6 only;
# the remaining entries are missing.
category <- c("based", "trill", "lit", "cold", NA, "epic", rep(NA, 4))
description <- c(
  "skip class",
  "dunk on brayden",
  "record deal",
  "fame and fortune",
  NA,
  "female attention",
  rep(NA, 4)
)
toy.df <- data.frame(
  prjnumber = prjnumber,
  category = category,
  description = description
)
> toy.df
prjnumber category description
1 1 based skip class
2 2 trill dunk on brayden
3 3 lit record deal
4 4 cold fame and fortune
5 5 <NA> <NA>
6 6 epic female attention
7 7 <NA> <NA>
8 8 <NA> <NA>
9 9 <NA> <NA>
10 10 <NA> <NA>
I want to randomly sample the 'category' and 'description' columns from rows that have been filled in to use as infill for rows with missing data.
The final data frame would be complete and would only rely on the initial 5 rows which contain data. The solution would preserve between-column correlation.
An expected output would be:
> toy.df
prjnumber category description
1 1 based skip class
2 2 trill dunk on brayden
3 3 lit record deal
4 4 cold fame and fortune
5 5 lit record deal
6 6 epic female attention
7 7 based skip class
8 8 based skip class
9 9 lit record deal
10 10 trill dunk on brayden
# Donor pool: the rows that have no missing values.
complete <- na.omit(toy.df)
# Rows whose category and description must be filled in.
needs_fill <- is.na(toy.df$category)
if (any(needs_fill)) {
  stopifnot(nrow(complete) > 0)
  # Draw whole donor rows, so category and description stay paired.
  # sample.int() avoids the 1:nrow() trap, which yields c(1, 0) when the
  # donor pool is empty.
  donor_rows <- sample.int(nrow(complete), size = sum(needs_fill), replace = TRUE)
  toy.df[needs_fill, c("category", "description")] <-
    complete[donor_rows, c("category", "description")]
}
toy.df
# prjnumber category description
# 1 1 based skip class
# 2 2 trill dunk on brayden
# 3 3 lit record deal
# 4 4 cold fame and fortune
# 5 5 lit record deal
# 6 6 epic female attention
# 7 7 cold fame and fortune
# 8 8 based skip class
# 9 9 epic female attention
# 10 10 epic female attention
Though it would seem a little more straightforward if you didn't start with the unique identifiers filled out for the NA rows...
You could try
library(dplyr)

# Fill the missing entries of one column with observed values of that same
# column, drawn with replacement. Exactly one value is drawn per missing
# entry, and positions are sampled rather than values, so a single numeric
# observation is never expanded to 1:n by sample().
fill_na <- function(col) {
  missing <- is.na(col)
  observed <- col[!missing]
  col[missing] <- observed[sample.int(length(observed), sum(missing), replace = TRUE)]
  col
}

# across() replaces the deprecated mutate_each()/funs() pair.
# Columns 2:3 are category and description; each is filled independently.
toy.df %>%
  mutate(across(2:3, fill_na))
Based on new information, we may need a numeric index to use in the funs.
# Build one row index shared by both columns: complete rows point at
# themselves, rows with a missing category point at a randomly drawn complete
# row. Using the same index for category and description keeps them paired.
toy.df %>%
  mutate(
    indx = replace(
      row_number(),
      is.na(category),
      # one donor row per missing row, drawn with replacement
      which(!is.na(category))[
        sample.int(sum(!is.na(category)), sum(is.na(category)), replace = TRUE)
      ]
    )
  ) %>%
  # across() replaces the deprecated mutate_each()/funs() pair
  mutate(across(2:3, ~ .x[indx])) %>%
  select(-indx)
Using base R to fill in a single field at a time (not preserving the correlation between the fields), use something like:
fields <- c("category", "description")
# Fill each field on its own: every missing entry receives a randomly drawn
# observed value of the same field.
for (field in fields) {
  missings <- is.na(toy.df[[field]])
  observed <- toy.df[[field]][!missings]
  # Sample positions rather than values: sample(x, ...) treats a single
  # number x as 1:x, which would invent values for a numeric field.
  picks <- sample.int(length(observed), sum(missings), replace = TRUE)
  toy.df[[field]][missings] <- observed[picks]
}
and to fill them in simultaneously (preserving the correlation between the fields) use something like:
# A row needs filling when any of the fields is missing. complete.cases()
# checks this directly, without coercing the data frame to a matrix the way
# apply() does.
missings <- !complete.cases(toy.df[fields])
# Rows that can serve as donors.
donors <- which(!missings)
# Copy whole donor rows so the fields stay paired.
picked <- donors[sample.int(length(donors), sum(missings), replace = TRUE)]
toy.df[missings, fields] <- toy.df[picked, fields]
and of course, to avoid the implicit for loop in the apply(x,1,fun), you could use:
# TRUE for every row of a logical (or numeric) matrix x that has at least one
# TRUE (non-zero) entry.
rowAny <- function(x) rowSums(x) > 0
# rowAny() has to be given the NA indicator matrix: calling rowSums() on the
# raw character columns would fail.
missings <- rowAny(is.na(toy.df[, fields]))

Replace NA with mode from categorical dataset R

I have a dataset with categorical and NA observations of 10 variables. I want to replace the NA values of each column with the mode. I did a histogram of each variable for identifying the density for each observation and got the mode. I know what values to replace the NAs in each column with.
I saw there was a related post, but I already know what values to replace. Here's the link: Replace mean or mode for missing values in R
Here's to reproduce the dataset:
> #Create data with missing values
> set.seed(1)
> dat <- data.frame(x=sample(letters[1:3],20,TRUE), y=rnorm(20),
stringsAsFactors=FALSE)
> dat[c(5,10,15),1] <- NA
Here's an example:
> #The head of the first five observations
> head(SmallStoredf, n=5)
Age Gender HouseholdIncome MaritalStatus PresenceofChildren HomeOwnerStatus HomeMarketValue
1 <NA> Male <NA> <NA> <NA> <NA> <NA>
2 45-54 Female <NA> <NA> <NA> <NA> <NA>
5 45-54 Female 75k-100k Married Yes Own 150k-200k
6 25-34 Male 75k-100k Married No Own 300k-350k
7 35-44 Female 125k-150k Married Yes Own 250k-300k
Occupation Education LengthofResidence
1 <NA> <NA> <NA>
2 <NA> <NA> <NA>
5 <NA> Completed High School 9 Years
6 <NA> Completed High School 11-15 years
7 <NA> Completed High School 2 Years
In this example, I want NAs in HomeOwnerStatus replaced with Own, HomeMarketValue with 350K-500K, and Occupation with Professional.
EDIT: I tried inputting the values in, but got an error about three of the columns.
> replacementVals <- c(Age = "45-54", Gender = "Male", HouseholdIncome = "50K-75K",
+ MaritalStatus = "Single", PresenceofChildren = "No",
+ HomeOwnerStatus = "Own", HomeMarketValue = "350K-500K",
+ Occupation = "Professional", Education = "Completed High School",
+ LengthofResidence = "11-15yrs")
> indx1 <- replacementVals[col(df2)][is.na(df2[,names(replacementVals)])]
> df2[is.na(df2[,names(replacementVals)])] <- indx1
#Warning messages:
#1: In `[<-.factor`(`*tmp*`, thisvar, value = c("50K-75K", "50K-75K", :
invalid factor level, NA generated
#2: In `[<-.factor`(`*tmp*`, thisvar, value = c("350K-500K", "350K-500K", :
invalid factor level, NA generated
#3: In `[<-.factor`(`*tmp*`, thisvar, value = c("11-15yrs", "11-15yrs", :
invalid factor level, NA generated
Here's the output:
> head(SmallStoredf)
Age Gender HouseholdIncome MaritalStatus PresenceofChildren HomeOwnerStatus HomeMarketValue
1 45-54 Male <NA> Single No Own <NA>
2 45-54 Female <NA> Single No Own <NA>
5 45-54 Female 75k-100k Married Yes Own 150k-200k
6 25-34 Male 75k-100k Married No Own 300k-350k
7 35-44 Female 125k-150k Married Yes Own 250k-300k
8 55-64 Male 75k-100k Married No Own 150k-200k
Occupation Education LengthofResidence
1 Professional Completed High School <NA>
2 Professional Completed High School <NA>
5 Professional Completed High School 9 Years
6 Professional Completed High School 11-15 years
7 Professional Completed High School 2 Years
8 Professional Completed High School 16-19 years
Only NA values in some columns were replaced.
I amended your reproducible example a little bit, here's the setup
# Create data with missing values
set.seed(1)
dat <- data.frame(
  x = sample(letters[1:3], 20, replace = TRUE),
  y = rnorm(20),
  stringsAsFactors = FALSE
)
dat[c(5, 10, 15), 1] <- NA
# The extra missing value goes into the numeric column y (column 2) of row 6,
# as the printed data shows.
dat[6, 2] <- NA
#output
# x y
#1 a 1.511781168450847978590
#2 b 0.389843236411431093291
#3 b -0.621240580541803755210
#4 c -2.214699887177499881830
#5 <NA> 1.124930918143108193874
#6 c NA
#7 c -0.016190263098946087311
#8 b 0.943836210685299215051
#9 b 0.821221195098088552200
#10 <NA> 0.593901321217508826322
#11 a 0.918977371608218240873
#12 a 0.782136300731067102276
#13 c 0.074564983365190601328
#14 b -1.989351695863372793127
#15 <NA> 0.619825747894710232799
#16 b -0.056128739529000784558
#17 c -0.155795506705329295238
#18 c -1.470752383899274429169
#19 b -0.478150055108620353206
#20 c 0.417941560199702411005
now define your replacement vals, labeled by the columns you want to have NAs replaced
replacementVals <- c(x = "Xreplace", y = "Yreplace")
# and the next call can replace them all in one shot.
# Each replacement is looked up by the column the NA sits in. Assigning
# replacementVals directly would merely recycle "Xreplace", "Yreplace", ...
# over the NAs in order, regardless of column.
target <- dat[names(replacementVals)]
na_mask <- is.na(target)
target[na_mask] <- replacementVals[col(na_mask)][na_mask]
dat[names(replacementVals)] <- target
# x y
#1 a 1.51178116845085
#2 b 0.389843236411431
#3 b -0.621240580541804
#4 c -2.2146998871775
#5 Xreplace 1.12493091814311
#6 c Yreplace
#7 c -0.0161902630989461
#8 b 0.943836210685299
#9 b 0.821221195098089
#10 Xreplace 0.593901321217509
#11 a 0.918977371608218
#12 a 0.782136300731067
#13 c 0.0745649833651906
#14 b -1.98935169586337
#15 Xreplace 0.61982574789471
#16 b -0.0561287395290008
#17 c -0.155795506705329
#18 c -1.47075238389927
#19 b -0.47815005510862
#20 c 0.417941560199702
But as akrun pointed out, and subsequently solved, this didn't map well to your second data frame example. This is just taken straight from the comments they made (so either way they should probably get the check on this question)
We'll do the setup, I'm not going to do all the prints except for the result
HomeOwnerStatus <- c(NA, NA, NA, "Rent", "Rent")
HomeMarketValue <- c(NA, NA, NA, "350k", "350k")
Occupation <- c(NA, NA, NA, NA, NA)
SmallStoreddf <- data.frame(
  HomeOwnerStatus, HomeMarketValue, Occupation,
  stringsAsFactors = FALSE
)
replacementVals <- c(
  "HomeOwnerStatus" = "Rent",
  "HomeMarketValue" = "350k",
  "Occupation" = "Professional"
)
# Then in two steps (which could be combined into one really long line) you go
# get the values that we will be replacing: one per NA cell, chosen by the
# column the NA sits in
target <- SmallStoreddf[names(replacementVals)]
na_mask <- is.na(target)
indx1 <- replacementVals[col(na_mask)][na_mask]
# do the replacement
target[na_mask] <- indx1
SmallStoreddf[names(replacementVals)] <- target
# HomeOwnerStatus HomeMarketValue Occupation
#1 Rent 350k Professional
#2 Rent 350k Professional
#3 Rent 350k Professional
#4 Rent 350k Professional
#5 Rent 350k Professional
Try: (Using your second example as it was a bit unclear when you showed two datasets)
# indx is a (row, col) matrix with one row per NA cell of SmallStoredf.
indx <- which(is.na(SmallStoredf), arr.ind=TRUE)
# indx[,2] is the column position of each NA, so it selects the replacement
# value belonging to that column.
# NOTE(review): assumes SmallStoredf holds exactly these three columns in
# this order - confirm against the real data.
SmallStoredf[indx] <- c("Own", "350K-500K", "Professional")[indx[,2]]
SmallStoredf
# HomeOwnerStatus HomeMarketValue Occupation
#1 Own 350K-500K Professional
#2 Own 350K-500K Professional
#3 Own 350K-500K Professional
#4 Rent 350k-500k Professional
#5 Rent 500k-1mm Professional
Upgrading comment.
If you are wanting to replace the missing data with the most frequent category, there may be an equal count of categories within a variable. So in the code below, the replacements are randomly sampled from the categories that are most frequent.
# some example data with missing
set.seed(1)
dat <- data.frame(
  x = sample(letters[1:3], 20, replace = TRUE),
  y = sample(letters[1:3], 20, replace = TRUE),
  w = rnorm(20),
  z = sample(letters[1:3], 20, replace = TRUE),
  stringsAsFactors = FALSE
)
dat[c(5, 10, 15), 1] <- NA
dat[c(3, 7), 2] <- NA

# function to get replacement for missing
# Returns one replacement per missing entry of x, drawn from the most
# frequent category. sample is used to randomly select categories, allowing
# for the case when the maximum frequency is shared by more than one category.
f <- function(x) {
  tab <- table(x)
  l <- sum(is.na(x))
  if (l == 0L) {
    return(character(0))
  }
  if (length(tab) == 0L) {
    # nothing observed, so there is no mode to impute from
    return(rep(NA_character_, l))
  }
  sample(names(tab)[tab == max(tab)], l, replace = TRUE)
}

# as we are using sample, set.seed before replacing
set.seed(1)
for (i in seq_along(dat)) {
  column <- dat[[i]]
  # is.numeric() must see the column itself: is.numeric(dat[i]) tests a
  # one-column data frame and is always FALSE, so numeric columns would be
  # imputed (and turned into text) as well.
  if (!is.numeric(column) && anyNA(column)) {
    column[is.na(column)] <- f(column)
    dat[[i]] <- column
  }
}
Gentle warning: you should think carefully before imputing missing data this way. For example, income is often more likely to be missing for the highest and lowest categories, so this method may incorrectly impute an average wage. You should consider why each variable is missing and whether it is reasonable to assume the data are MCAR or MAR. If so, I would then consider a more robust method of imputation (for example, the mice package).

Resources