Recoding data from 1,1,1, to 1,2,3 - r

So I have this dataframe. Under the column potential_child, I want to recode the values so that the oldest child == 1, the second oldest == 2, the third oldest == 3, etc. I have the ages of the children, but I am floundering on how exactly to do this.
DHS1 <- structure(list(person_id = c(1, 2, 1, 2, 3, 4, 1, 7, 1, 2), household_id = c(1,1, 6, 6, 6, 6, 7, 63342, 63344, 63344), year = c(2018, 2018,2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018), month = c(1,1, 1, 1, 1, 1, 1, 12, 12, 12), sex = c(2, 1, 1, 2, 1, 2, 1, 1,1, 2), age = c(28, 28, 44, 37, 10, 10, 60, 65, 55, 55), potential_mom = c(1,NA, NA, 1, NA, NA, NA, NA, NA, 1), potential_child = c(NA, NA,NA, NA, 1, 1, NA, NA, NA, NA), momloc = c(0, 0, 0, 0, 2, 2, 0,0, 0, 0), num_child = c(0, 0, 0, 0, 1, 1, 0, 0, 0, 0)), row.names = c(NA,-10L), class = c("tbl_df", "tbl", "data.frame"))
Me trying to think it through (apologies in advance for this ugly rambling):
mutate(potential_child2 = if potential_child == 1 & age =<)

We can arrange the data based on household_id and age and for each household_id get the cumulative sum of potential_child value after replacing NA with 0.
library(dplyr)

# Rank the potential children within each household from oldest (1) to
# youngest. `potential_child` is 1 for a potential child and NA otherwise, so a
# running count of the flags, taken in descending age order, gives each child
# its position among the household's children.
DHS1 %>%
  # desc(age) puts the oldest household member first, so the oldest child is
  # counted first and receives rank 1.
  arrange(household_id, desc(age)) %>%
  group_by(household_id) %>%
  # Or, if you also want to do it for every person:
  # group_by(person_id, household_id) %>%
  mutate(
    # Every reference to `potential_child` in this expression sees the original
    # column. Rows that were NA are set back to NA, so a non-child that sorts
    # after a child does not inherit the running count.
    potential_child = replace(
      cumsum(replace(potential_child, is.na(potential_child), 0)),
      is.na(potential_child),
      NA
    )
  ) %>%
  # Drop the grouping so later steps do not silently run per household.
  ungroup()

Related

How to make one table of proportions of demographic variables in R

I'm new to R and am having trouble with a simple command. How do I find the proportion of demographic variables (for example, proportion of English speakers in my population, or proportion of White respondents)?
I'd like to create a large table with all of the proportions, and would hopefully include mean age and median education level, but am having trouble finding the command. This is what I've tried:
# Frequency table of the English indicator, then each count as a share of the
# total. table() leaves NA values out by default, so the shares are taken over
# the non-missing answers only.
table2 <- table(VR_Data$English)
prop.table(table2)
# Same two steps for the race codes.
table3 <- table(VR_Data$race)
prop.table(table3)
# Same two steps for the male indicator.
table4 <- table(VR_Data$male)
prop.table(table4)
If it helps, this is my data:
structure(list(study = c(4, 4, 4, 1, 1, 1), TREATMENT = c(0,
0, 0, 0, 0, 0), TREATMENT4 = c(0, 0, 0, 0, 0, 0), TREATMENT2 = c(0,
0, 0, 0, 0, 0), TREATMENT3 = c(0, 0, 0, 0, 0, 0), order = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), id = c(279,
238, 239, 135, 143, 138), treatment = c(0, 0, 0, 0, 0, 0), treatment_condition = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), control_condition = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), m_check1 = c(1,
1, 1, 1, 1, 1), relationship = c(NA, NA, NA, 7, 6, 5), payment = c(NA,
NA, NA, 10, 3, 3), educ_level = c(14, 14, 12, 16, 16, 18), golf = c(3,
5, 3, 3, 2, 3), male = c(1, 0, 1, 0, 0, 1), Asian = c(0, 1, 0,
0, 0, 0), Black = c(0, 0, 0, 0, 0, 0), Latino = c(1, 0, 0, 0,
0, 0), White = c(0, 0, 1, 1, 1, 1), age = c(27, 53, 49, 25, 28,
24), English = c(1, 1, 1, 1, 1, 1), education = c(16, 16, 14,
14, 14, 16), enjoy = c(4, 1, 3.5, 4.25, 3.25, 3.5), RELATIONSHIP = c(4.33333349227905,
1, 4.33333349227905, 3.66666674613953, 3.5, 3.66666674613953),
anxiety = c(3, 3.40000009536743, 2.20000004768372, 1.25,
2, 1.25), BEH_SIM = c(3, 1, 3.75, 2.75, 2.5, 1.75), sptconf = c(3.33333325386047,
1.5, 4, 4.83333349227905, 4, 3.66666674613953), NEG_EFFICACY = c(4,
1.16666662693024, 3.66666674613953, 4.83333349227905, 4.16666650772095,
4.5), spteffort = c(3.16666674613953, 3.5, 4.16666650772095,
3.16666674613953, 3.16666674613953, 3.5), SPTEFFORT_OTHER = c(3.16666674613953,
3.5, 3.5, 3.16666674613953, 3, 3.33333325386047), SIM_VALUES = c(3.75,
1, 3.75, 3.75, 1.5, 2.25), COOP_MOTIV = c(2.33333325386047,
3, 2.66666674613953, 5, 2.5, 2.66666674613953), COMP_MOTIV = c(5,
5, 3.20000004768372, 4.40000009536743, 2.40000009536743,
4.40000009536743), presence = c(NA, NA, NA, 2.79999995231628,
1.79999995231628, 2.59999990463257), environ = c(NA, NA,
NA, 3, 4, 3), openresponse = c(NA, NA, NA, 94.25, 86, 60),
TotalOwnerCommission = c(300, 266.666656494141, 258.333343505859,
266.666656494141, 383.333343505859, 325), TotalRangerComm = c(258.333343505859,
233.33332824707, 291.666656494141, 258.333343505859, 175,
166.66667175293), TotalComm = c(279.166687011719, 250, 275,
262.5, 279.166687011719, 245.833343505859), merge = c(1,
1, 1, 0, 0, 0), Control = c(1, 1, 1, NA, NA, NA), treatment_Shoes = c(0,
0, 0, NA, NA, NA), treatment_Instructions_Only = c(0, 0,
0, NA, NA, NA), treatment_Info_Only = c(0, 0, 0, NA, NA,
NA), treatment_Info_Instructions = c(0, 0, 0, NA, NA, NA),
group = c("OwnerOnly", "OwnerOnly", "OwnerOnly", "", "",
""), race = c(4, 2, 5, NA, NA, NA), race_a = c("", "", "",
"", "", ""), RELATIONSHIP_2 = c(9.02055358886719, 1, 9.02055358886719,
7.02113246917725, 6.54790019989014, 7.02113246917725), TotalOwnerCommission_2 = c(5196.15234375,
4354.64794921875, 4152.12744140625, 4354.64794921875, 7505.24560546875,
5859.02099609375)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
How can I put all of these proportions into one table, with mean and medians? Is this possible? Thank you so much in advance.
If I understand your question correctly, this should help you.
library(dplyr)

# One-row summary of the demographic variables. English, White and male are
# 0/1 indicators in the sample data, so the mean of each one is the proportion
# of respondents coded 1.
VR_Data %>%
  summarize(
    # na.rm = TRUE keeps a single missing answer from turning the whole
    # statistic into NA; each value is then computed over the non-missing rows.
    English_prop = mean(English, na.rm = TRUE),
    White_prop = mean(White, na.rm = TRUE),
    male_prop = mean(male, na.rm = TRUE),
    age_avg = mean(age, na.rm = TRUE),
    education_avg = mean(education, na.rm = TRUE)
  )
Should give you this...
# A tibble: 1 x 5
English_prop White_prop male_prop age_avg education_avg
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 0.667 0.5 34.3 15

Create table using kable in R

This is the code that I used (with a lot of help from the StackOverflow community!) to create a simpler table using the same data:
# NOTE(review): only `here` is attached in this snippet, but `%>%` is used
# below; presumably magrittr or dplyr is attached elsewhere -- confirm.
library(here)
# Per-treatment summary of TotalComm: group size, mean and standard deviation.
ANOVA_Relationship_Subset_sum <- ANOVA_Relationship_Subset %>%
dplyr::group_by(treatment) %>%
dplyr::summarize(
n=n(),
mean=mean(TotalComm),
`std. dev` = sd(TotalComm)
)
# Print the summary table.
ANOVA_Relationship_Subset_sum
Now I'm on to something a little more complicated; how can I create a table like this:
If it helps, this is my data:
structure(list(study = c(4, 4, 4, 1, 1, 1), TREATMENT = c(0,
0, 0, 0, 0, 0), TREATMENT4 = c(0, 0, 0, 0, 0, 0), TREATMENT2 = c(0,
0, 0, 0, 0, 0), TREATMENT3 = c(0, 0, 0, 0, 0, 0), order = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), id = c(279,
238, 239, 135, 143, 138), treatment = c(0, 0, 0, 0, 0, 0), treatment_condition = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), control_condition = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), m_check1 = c(1,
1, 1, 1, 1, 1), relationship = c(NA, NA, NA, 7, 6, 5), payment = c(NA,
NA, NA, 10, 3, 3), educ_level = c(14, 14, 12, 16, 16, 18), golf = c(3,
5, 3, 3, 2, 3), male = c(1, 0, 1, 0, 0, 1), Asian = c(0, 1, 0,
0, 0, 0), Black = c(0, 0, 0, 0, 0, 0), Latino = c(1, 0, 0, 0,
0, 0), White = c(0, 0, 1, 1, 1, 1), age = c(27, 53, 49, 25, 28,
24), English = c(1, 1, 1, 1, 1, 1), education = c(16, 16, 14,
14, 14, 16), enjoy = c(4, 1, 3.5, 4.25, 3.25, 3.5), RELATIONSHIP = c(4.33333349227905,
1, 4.33333349227905, 3.66666674613953, 3.5, 3.66666674613953),
anxiety = c(3, 3.40000009536743, 2.20000004768372, 1.25,
2, 1.25), BEH_SIM = c(3, 1, 3.75, 2.75, 2.5, 1.75), sptconf = c(3.33333325386047,
1.5, 4, 4.83333349227905, 4, 3.66666674613953), NEG_EFFICACY = c(4,
1.16666662693024, 3.66666674613953, 4.83333349227905, 4.16666650772095,
4.5), spteffort = c(3.16666674613953, 3.5, 4.16666650772095,
3.16666674613953, 3.16666674613953, 3.5), SPTEFFORT_OTHER = c(3.16666674613953,
3.5, 3.5, 3.16666674613953, 3, 3.33333325386047), SIM_VALUES = c(3.75,
1, 3.75, 3.75, 1.5, 2.25), COOP_MOTIV = c(2.33333325386047,
3, 2.66666674613953, 5, 2.5, 2.66666674613953), COMP_MOTIV = c(5,
5, 3.20000004768372, 4.40000009536743, 2.40000009536743,
4.40000009536743), presence = c(NA, NA, NA, 2.79999995231628,
1.79999995231628, 2.59999990463257), environ = c(NA, NA,
NA, 3, 4, 3), openresponse = c(NA, NA, NA, 94.25, 86, 60),
TotalOwnerCommission = c(300, 266.666656494141, 258.333343505859,
266.666656494141, 383.333343505859, 325), TotalRangerComm = c(258.333343505859,
233.33332824707, 291.666656494141, 258.333343505859, 175,
166.66667175293), TotalComm = c(279.166687011719, 250, 275,
262.5, 279.166687011719, 245.833343505859), merge = c(1,
1, 1, 0, 0, 0), Control = c(1, 1, 1, NA, NA, NA), treatment_Shoes = c(0,
0, 0, NA, NA, NA), treatment_Instructions_Only = c(0, 0,
0, NA, NA, NA), treatment_Info_Only = c(0, 0, 0, NA, NA,
NA), treatment_Info_Instructions = c(0, 0, 0, NA, NA, NA),
group = c("OwnerOnly", "OwnerOnly", "OwnerOnly", "", "",
""), race = c(4, 2, 5, NA, NA, NA), race_a = c("", "", "",
"", "", ""), RELATIONSHIP_2 = c(9.02055358886719, 1, 9.02055358886719,
7.02113246917725, 6.54790019989014, 7.02113246917725), TotalOwnerCommission_2 = c(5196.15234375,
4354.64794921875, 4152.12744140625, 4354.64794921875, 7505.24560546875,
5859.02099609375)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
Briefly, I do want to thank the SO community for all their help with R. I don't know how I would have gotten this far without all of your help.
Try the apaTables Package! Format your data as per the example, and use the apa.aov.table() function to transform your table to APA style.

How to convert a wide data into long format for cross-classified model [R, GLMM]

I would like to convert wide data to long data in R, and my data set is for cross-classified models, exploring participants’ response to each target item that has different characteristics.
condition is one of the two conditions that participants were
assigned to.
The participants were tested twice: t1 and t2.
As for item-level predictor variables, x1 and x2, are coded.
As for response, whether participants’ response to the item was right or wrong was coded.
two test formats were administered, test1 and test2.
Although there are many tutorials on wide-to-long conversion, I could not find one that specifically explains the conversion for cross-classified models.
I would like to use tidyverse if possible for the sake of consistency.
My sample data is the following:
structure(list(item_name = c("x1", "x2", "participant_id", "1",
"2", "3", "4", "5", "6", "7"), participant_variable_1 = c(NA,
NA, NA, 20, 23, 21, 20, 19, 22, 30), condition = c(NA, NA, NA,
"A", "B", "A", "B", "A", "B", "A"), t1.item1.test1 = c(1, 3,
NA, 0, 1, 0, 1, 0, 0, 1), t1.item2.test1 = c(2, 2, NA, 0, 0,
0, 1, 1, 0, 1), t1.item3.test1 = c(1, 3, NA, 0, 0, 0, 1, 0, 0,
0), t1.item4.test1 = c(3, 1, NA, 1, 0, 0, 0, 1, 1, 0), t2.item1.test1 = c(1,
3, NA, 0, 1, 1, 0, 1, 1, 1), t2.item2.test1 = c(2, 2, NA, 1,
0, 1, 0, 1, 0, 1), t2.item3.test1 = c(1, 3, NA, 0, 0, 0, 1, 0,
0, 0), t2.item4.test1 = c(3, 1, NA, 1, 1, 0, 1, 1, 1, 0), t1.item1.test2 = c(1,
3, NA, 0, 1, 0, 1, 0, 0, 1), t1.item2.test2 = c(2, 2, NA, 0,
0, 0, 1, 1, 0, 1), t1.item3.test2 = c(1, 3, NA, 0, 0, 0, 1, 0,
0, 0), t1.item4.test2 = c(3, 1, NA, 1, 0, 0, 0, 1, 1, 0), t2.item1.test2 = c(1,
3, NA, 0, 1, 1, 0, 1, 1, 1), t2.item2.test2 = c(2, 2, NA, 1,
0, 1, 0, 1, 0, 1), t2.item3.test2 = c(1, 3, NA, 0, 0, 0, 1, 0,
0, 0), t2.item4.test2 = c(3, 1, NA, 1, 1, 0, 1, 1, 1, 0)), row.names = c(NA,
-10L), class = c("tbl_df", "tbl", "data.frame"))
I would like to have long-format data that looks like the following:
Please and thank you for your guidance!
This answer requires heavy use of the new pivot_ functions in the dev version of tidyr. You can install that with devtools::install_github("tidyverse/tidyr") if you're willing to run the dev version.
First we split the data into item and participant info - you're not really getting any benefit from storing both in the same table:
# Attach the tidyverse before the first use of `%>%` and rename().
library(tidyverse)

# Rows 1-2 of the sample data hold the item-level predictors (x1, x2).
item_info <- dat[1:2, ]
# Row 3 is the "participant_id" header row; every row after it is one
# participant. Negative indexing avoids the backwards sequence that
# 4:nrow(dat) produces when `dat` has fewer than four rows.
participant_info <- dat[-(1:3), ] %>%
  rename(participant_id = item_name)
Then it's time for a lot of pivoting:
# I have the dev version of tidyr so that is being loaded
library(tidyverse)
# Item-level table: drop the participant-only columns, reshape the measurement
# columns to one row per (time, item), then spread the values of `item_name`
# into their own columns.
# NOTE(review): the sample data names its columns t1.item1.test1, ...,
# t2.item4.test2, whereas the range and pattern below refer to
# t1.item1 ... t2.item4 with no ".testN" suffix -- confirm the column names of
# `dat` before running.
item_long = item_info %>%
select(-participant_variable_1, -condition) %>%
pivot_longer(
cols = t1.item1:t2.item4,
names_to = c("time", "item"),
names_pattern = "t(\\d)\\.(item\\d)",
) %>%
pivot_wider(names_from = item_name, values_from = value)
# Participant-level table: the same reshape, with the cell values stored in
# `response`.
participant_long = participant_info %>%
pivot_longer(
cols = t1.item1:t2.item4,
names_to = c("time", "item"),
names_pattern = "t(\\d)\\.(item\\d)",
values_to = "response"
)
# Attach the item-level columns to every participant row that shares the same
# item and time.
combined = participant_long %>%
left_join(item_long, by = c("item", "time"))
Result:
> combined
# A tibble: 56 x 8
participant_id participant_variable_1 condition time item response x1 x2
<chr> <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl>
1 1 20 A 1 item1 0 1 3
2 1 20 A 1 item2 0 2 2
3 1 20 A 1 item3 0 1 3
4 1 20 A 1 item4 1 3 1

How to overlay survival plot

I would like to overlay two different survival curves on the same plot, for example OS and PFS (the values shown here are made up).
N pt. OS. OS_Time_(years). PFS. PFS_Time_(years).
__________________________________________________________________
1. 1 12 0 12
2. 0 10 1 8
3. 0 14 0 14
4. 0 10 0 10
5. 1 11 1 8
6. 1 16 1 6
7. 0 11 1 4
8. 0 12 1 10
9. 1 9 0 9
10 1 10 1 9
__________________________________________________________
First, I import my dataset:
library(readxl)
testR <- read_excel("~/test.xlsx")
View(testR)
Then, I created survfit for both OS and PFS:
# Fit one overall survival curve (no grouping variable) for each endpoint.
# NOTE(review): the import above stores the data as `testR`, and the sample data
# further down names its time columns OS_times and PFS_time -- confirm that
# `test`, OS_t and PFS_t exist in the session. Surv() and survfit() come from
# the survival package, which is not attached in the lines shown.
OS<-survfit(Surv(OS_t,OS)~1, data=test)
PFS<-survfit(Surv(PFS_t,PFS)~1, data=test)
And finally, I can plot each one thanks to:
plot(OS)
plot(PFS)
for example (or ggplot2...).
Here is my question: if I want to overlay the two curves on the same graph, how can I do it?
I tried multipleplot or
ggplot(testR, aes(x)) + # basic graphical object
geom_line(aes(y=y1), colour="red") + # first layer
geom_line(aes(y=y2), colour="green") # second layer
But it didn't work (though I'm not sure I used it correctly).
Can someone help me, please ?
Thanks a lot
Here is my code for Data sample:
test <- structure(list(ID = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, 2, 3, 4, 5, 6, 7, 8, 9),
Sex = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1),
Tabac = c(2, 0, 1, 1, 0, 0, 2, 0, 0, 0, 1, 1, 1, 0, 2, 0, 1, 1, 1),
Bmi = c(20, 37, 37, 25, 28, 38, 16, 27, 26, 28, 15, 36, 20, 17, 28, 37, 27, 26, 18),
Age = c(75, 56, 45, 65, 76, 34, 87, 43, 67, 90, 56, 37, 84, 45, 80, 87, 90, 65, 23), c(0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0),
OS_times = c(2, 4, 4, 2, 3, 5, 5, 3, 2, 2, 4, 1, 3, 2, 4, 3, 4, 3, 2),
OS = c(0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0),
PFS_time = c(1, 2, 1, 1, 3, 4, 3, 1, 2, 2, 4, 1, 2, 2, 2, 3, 4, 3, 2),
PFS = c(1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0)),
.Names = c("ID", "Sex", "Tabac", "Bmi", "Age", "LN", "OS_times", "OS", "PFS_time", "PFS"),
class = c("tbl_df", "tbl", "data.frame"),
row.names = c(NA, -19L))
You may use the ggsurv function from the GGally package in the following way. Combine both groups of variables in a data frame and add a "type" column. Later in the call to the plot, you refer to the type.
I used your data structure and named it "test". Afterwards, I transformed it to a data frame with the name "testdf".
# Surv() and survfit() come from the survival package, so attach it explicitly
# alongside GGally (which provides ggsurv()).
library(survival)
library(GGally)

testdf <- data.frame(test)

# Stack the two endpoints into one long table: one row per patient and
# endpoint, with `type` recording which endpoint the row belongs to.
OS_PFS1 <- data.frame(
  life = testdf[["OS"]],
  life_times = testdf[["OS_times"]],
  type = "OS"
)
OS_PFS2 <- data.frame(
  life = testdf[["PFS"]],
  life_times = testdf[["PFS_time"]],
  type = "PFS"
)
OS_PFS <- rbind(OS_PFS1, OS_PFS2)

# Fitting with `type` on the right-hand side yields one curve per endpoint, so
# a single ggsurv() call draws both curves on the same axes.
sf.OS_PFS <- survfit(Surv(life_times, life) ~ type, data = OS_PFS)
ggsurv(sf.OS_PFS)
if you want the confidence intervals shown:
ggsurv(sf.OS_PFS, CI = TRUE)
Please let me know whether this is what you want.

Preparing data before doing Principal component analysis (PCA)

I have a data frame (200 x 300) that consists of mixed (character and numeric) variables and has lots of missing values (NA).
My first problem is how to convert all of the data to numeric; I could use factors, but there are about 100 columns to convert.
Secondly, my columns are not all expressed in equivalent units.
I would just like some good advice on preparing the data before starting my analysis.
The following is the structure of the data:
structure(list(Hormonal.cycle.status..P4. = c(1, 1, 4, 1, 4,
1), Hormonal.medication.status = c(2, 1, 1, 2, 1, 1), Hormonal.medication.type = c(21,
27, 27, 26, 27, 27), ID.pathologist.main = c(3, 3, 3, 4, 2, 1
), ID.pathologist.sub = c(2, 1, 2, 2, 2, 2), Day.of.the.cycle_calculated = c(10,
8, 22, 19, 19, 12), Cycle.status..histology.and.cycle.day. = c(12,
18, 9, 1, 18, 3), Cycle.status.final..P4..histology..cycle.day. = c(1,
4, 5, 1, 6, 3), Deep.lesion = c(2, 2, 1, 2, 1, 2), Ovarian.lesion = c(2,
2, 2, 1, 2, 2), Peritoneal.lesion = c(2, 2, 2, 2, 2, 2), Combination.of.lesions = c(1,
1, 7, 4, 7, 1), DEEP.all.types = c(2, 2, NA, 2, NA, 2), DEEP.uterosacral = c(2,
2, NA, 2, NA, 2), DEEP.RVE = c(1, 1, 1, 2, 1, 1), DEEP.bowel = c(1,
1, 1, 1, 1, 1), DEEP.bladder = c(1, 1, 1, 1, 1, 1), Ovarian.endo.cyst = c(5,
5, 3, 1, 3, 5), Peritoneal = c(2, NA, 2, NA, 2, 2), Peritoneal.size.total = c(3,
3, 3, 3, 3, 2), Date.of.the.surgery = c(96, 98, 17, 105, 107,
108), Type.of.surgery = c(1, 1, 1, 1, 1, 1), Perit..surface = c(1,
2, 3, 3, 2, 2), Perit..deep = c(3, NA, NA, NA, 3, 3), R.ovary.surface = c(NA,
1, 2, 1, 1, NA), R.ovary.deep = c(NA, NA, 2, NA, 3, NA), L.ovary.surface = c(2,
NA, 2, NA, 1, NA), L.ovary.deep = c(2, 4, 4, NA, 4, 4), F.d.block = c(NA,
NA, 1, 1, NA, 1), R.ovary.frail = c(NA, NA, 3, NA, NA, 3), R.ovary.tight = c(NA,
NA, NA, NA, 3, NA), L.ovary.frail = c(NA, NA, NA, 2, NA, NA),
L.ovary.tight = c(2, 2, 3, NA, 2, 2), R.tuba.frail = c(NA,
NA, NA, NA, NA, 2), R.tuba.tight = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), L.tuba.frail = c(NA,
NA, NA, NA, NA, 2), L.tuba.tight = c(4, NA, NA, NA, 4, NA
), Elsewhere = c(35, NA, NA, 24, NA, NA), Other.diseases = c(16,
NA, NA, NA, NA, 11), In.microarray = c(2, 2, 1, 2, 2, 1),
In.cytokine.plexes = c(2, 2, 2, 2, 2, 2)), .Names = c("Hormonal.cycle.status..P4.",
"Hormonal.medication.status", "Hormonal.medication.type", "ID.pathologist.main",
"ID.pathologist.sub", "Day.of.the.cycle_calculated", "Cycle.status..histology.and.cycle.day.",
"Cycle.status.final..P4..histology..cycle.day.", "Deep.lesion",
"Ovarian.lesion", "Peritoneal.lesion", "Combination.of.lesions",
"DEEP.all.types", "DEEP.uterosacral", "DEEP.RVE", "DEEP.bowel",
"DEEP.bladder", "Ovarian.endo.cyst", "Peritoneal", "Peritoneal.size.total",
"Date.of.the.surgery", "Type.of.surgery", "Perit..surface", "Perit..deep",
"R.ovary.surface", "R.ovary.deep", "L.ovary.surface", "L.ovary.deep",
"F.d.block", "R.ovary.frail", "R.ovary.tight", "L.ovary.frail",
"L.ovary.tight", "R.tuba.frail", "R.tuba.tight", "L.tuba.frail",
"L.tuba.tight", "Elsewhere", "Other.diseases", "In.microarray",
"In.cytokine.plexes"), row.names = c("H003", "H004", "H006",
"H007", "H008", "H011"), class = "data.frame")

Resources