Related
I want to compare the Date field of two data frames and add only the newest records to the second one. The first data frame holds the latest records, which are updated daily from a website. The second one is read from a CSV file that I saved the previous day.
data I read from the internet:
# Records as downloaded from the site; Date arrives as "dd/mm/yyyy" text.
df_new <- structure(
  list(
    DCounter = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
    CCounter = c(125L, 36L, 22L, 17L, 11L, 8L, 4L, 20L, 8L, 3L),
    RCounter = c(24L, 33L, 34L, 50L, 33L, 21L, 62L, 10L, 20L, 31L),
    CrCounter = c(1L, 1L, 8L, 2L, 2L, 8L, 2L, 3L, 0L, 1L),
    Date = c(
      "20/03/2020", "19/03/2020", "18/03/2020", "17/03/2020", "16/03/2020",
      "15/03/2020", "14/03/2020", "13/03/2020", "12/03/2020", "11/03/2020"
    )
  ),
  class = "data.frame", row.names = c(NA, 10L)
)
# Format the date field to be Date type and rename field.
# The year is written with four digits, so it must be parsed with "%Y".
# "%y" only consumes the first two digits ("20") and silently ignores the
# rest, which happens to give the right answer for 2020 but not for any
# other year.
df_new$Date <- as.Date(df_new$Date, format = "%d/%m/%Y")
colnames(df_new) <- c("D", "C", "R", "Cr", "Date")
#old data- read from csv file has data from yesterday
#----------------------
#df_old <- read.csv("df_Saved.csv",header=T)
df_old<-structure(list(D = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
C = c(6L, 12L, 7L, 11L, 8L, 4L, 20L, 8L, 3L, 4L, 1L, 3L, 3L, 0L, 2L, 0L, 0L, 10L, 1L, 0L, 2L, 17L, 15L, 6L, 5L),
R = c(3L,3L, 0L, 3L, 2L, 2L, 0L, 0L, 3L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
Cr = c(1L, 0L, 0L, 0L, 0L, 2L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 2L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
Date = structure(c(17L, 16L, 15L, 14L, 13L, 12L, 11L, 10L, 9L, 8L, 25L, 24L, 23L, 22L, 21L, 20L, 19L, 18L, 7L, 6L, 5L, 4L, 3L, 2L, 1L),
.Label = c("2/24/2020", "2/25/2020", "2/26/2020", "2/27/2020",
"2/28/2020", "2/29/2020", "3/1/2020", "3/10/2020", "3/11/2020", "3/12/2020",
"3/13/2020", "3/14/2020", "3/15/2020", "3/16/20", "3/17/20", "3/18/20",
"3/19/20", "3/2/2020", "3/3/2020", "3/4/2020", "3/5/2020", "3/6/2020", "3/7/2020", "3/8/2020", "3/9/2020"), class = "factor")),
class = "data.frame", row.names = c(NA, -25L))
Get today's date and format it
#--------------
# Parse dates that may be Date objects, factors, or text written as
# "yyyy-mm-dd", "m/d/yyyy" or "m/d/yy". Every element is matched against the
# layouts individually because the saved data mixes two- and four-digit
# years. Values in any other layout become NA.
parse_saved_date <- function(x) {
  txt <- trimws(as.character(x))
  parsed <- rep(as.Date(NA), length(txt))
  layouts <- c(
    "^[0-9]{4}-[0-9]{1,2}-[0-9]{1,2}$" = "%Y-%m-%d",
    "^[0-9]{1,2}/[0-9]{1,2}/[0-9]{4}$" = "%m/%d/%Y",
    "^[0-9]{1,2}/[0-9]{1,2}/[0-9]{2}$" = "%m/%d/%y"
  )
  for (pattern in names(layouts)) {
    hit <- !is.na(txt) & grepl(pattern, txt)
    parsed[hit] <- as.Date(txt[hit], format = layouts[[pattern]])
  }
  parsed
}

# Turn the saved dates into real Date values so they can be compared with
# df_new$Date and are written back to the CSV in one unambiguous layout.
df_old$Date <- parse_saved_date(df_old$Date)

# extract ONLY updated dates
# "New" means newer than the most recent saved record, not "today or later":
# the site may not have published today's figures yet, and a run may have
# been skipped for several days.
is_new <- !is.na(df_new$Date)
if (any(!is.na(df_old$Date))) {
  is_new <- is_new & df_new$Date > max(df_old$Date, na.rm = TRUE)
}
df_newExtracted <- df_new[is_new, , drop = FALSE]

# Decide on the row count so an empty extraction cannot break the `if`.
if (nrow(df_newExtracted) > 0) {
  df_final <- rbind(df_newExtracted, df_old)
  cat("Add New records\n")
} else {
  df_final <- df_old
  cat("Nothing new \n")
}
df_final
write.csv(df_final, "df_Saved.csv", row.names = FALSE)
I couldn't figure out the root cause of the problem: sometimes, when the dates differ by one day, it works, and sometimes, when they differ by two days, it does not. Also, df_newExtracted may refer to a date that the site has not published yet; for example, if we run the code today and they still haven't updated their records, the variable will be empty and all later calculations crash.
Some suggest the issue is related to writing to and reading from the CSV file, which changes the date format and makes the file unstable, and that I should use lubridate; that's why I added the formatting lines. Any suggestions?
I would like to plot the distribution of multiple columns of my data set. It has over 820.000 rows and 18 columns. I want to plot all columns except the columns with the dummy variables. I have already been able to create a graphic. But I want to have the values of the x-axis on the y-axis because these are the column values and I want to display their distribution for each column.
1. Definition of the path
setwd("C:/Users/A/Documents/Master BWL/Masterarbeit")
2. Loading the required packages
library(factoextra); library(cluster); library(skmeans); library(mclust);
library(fpc); library(psda); library(simEd); library (ggpubr);
library(dbscan); library(clustertend); library(MASS); library(devtools);
library(ggbiplot);library(NbClust); library(clValid); library(plotrix)
library(graphics); library(reshape2)
3. Import csv file
WKA_ohneJB <- read.csv("WKA_ohneJB_PCA.csv", header=TRUE, sep = ";", stringsAsFactors = FALSE)
4 Select columns
# Keep the selection as a data frame. Concatenating the columns with c()
# flattens them into one unnamed vector, so the column each value came from
# is lost before melt() is called.
plot_columns <- c(
  "BASKETS_NZ", "PIS", "PIS_AP", "PIS_DV", "PIS_PL",
  "PIS_SDV", "PIS_SHOPS", "PIS_SR", "QUANTITY"
)
WKA_ohneJB2 <- WKA_ohneJB[, plot_columns, drop = FALSE]
# Long format: one row per (column name, value) pair, in columns
# `variable` and `value`.
df <- melt(WKA_ohneJB2, measure.vars = plot_columns)
# 5 Plot
# One histogram panel per original column; free scales because the columns
# have very different ranges.
ggplot(df, aes(x = value)) +
  geom_histogram(bins = 30) +
  facet_wrap(~variable, scales = "free")
This is the plot I have generated so far.
Here is a part of my dataset:
dput(rbind(head(WKA_ohneJB, 10), tail(WKA_ohneJB, 10)))
structure(list(X = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
821039L, 821040L, 821041L, 821042L, 821043L, 821044L, 821045L,
821046L, 821047L, 821048L), BASKETS_NZ = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
LOGONS = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), PIS = c(71L, 39L, 50L, 4L,
13L, 4L, 30L, 65L, 13L, 31L, 111L, 33L, 3L, 46L, 11L, 8L,
17L, 68L, 65L, 15L), PIS_AP = c(14L, 2L, 4L, 0L, 0L, 0L,
1L, 0L, 2L, 1L, 13L, 0L, 0L, 2L, 1L, 0L, 3L, 8L, 0L, 1L),
PIS_DV = c(3L, 19L, 4L, 1L, 0L, 0L, 6L, 2L, 2L, 3L, 38L,
8L, 0L, 5L, 2L, 0L, 1L, 0L, 3L, 2L), PIS_PL = c(0L, 5L, 8L,
2L, 0L, 0L, 0L, 24L, 0L, 6L, 32L, 8L, 0L, 0L, 4L, 0L, 0L,
0L, 0L, 0L), PIS_SDV = c(18L, 0L, 11L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 6L, 0L, 0L, 13L, 0L, 0L, 1L, 15L, 1L, 0L), PIS_SHOPS = c(3L,
24L, 13L, 3L, 0L, 0L, 6L, 28L, 2L, 11L, 71L, 16L, 2L, 5L,
6L, 0L, 1L, 0L, 3L, 2L), PIS_SR = c(19L, 0L, 14L, 0L, 0L,
0L, 2L, 23L, 0L, 3L, 6L, 0L, 0L, 20L, 0L, 0L, 3L, 32L, 1L,
0L), QUANTITY = c(13L, 2L, 18L, 1L, 14L, 1L, 4L, 2L, 5L,
1L, 5L, 2L, 2L, 4L, 1L, 3L, 2L, 8L, 17L, 8L), WKA = c(1L,
1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 0L, 1L, 1L), NEW_CUST = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), EXIST_CUST = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), WEB_CUST = c(1L, 0L, 0L, 0L, 1L, 1L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L), MOBILE_CUST = c(0L,
1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 1L, 0L), TABLET_CUST = c(0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L),
LOGON_CUST_STEP2 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), row.names = c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 821039L, 821040L, 821041L,
821042L, 821043L, 821044L, 821045L, 821046L, 821047L, 821048L
), class = "data.frame")
6 Plotting histogram
# Columns to draw, one histogram each.
var_to_plot <- c(
  "BASKETS_NZ", "PIS", "PIS_AP", "PIS_DV", "PIS_PL",
  "PIS_SDV", "PIS_SHOPS", "PIS_SR", "QUANTITY"
)
# Lay the plots out on a 3 x 3 grid.
par(mfrow = c(3, 3))
for (column_name in var_to_plot) {
  # Label the x axis with the column name and suppress the title.
  hist(WKA_ohneJB[, column_name], xlab = column_name, main = "")
}
I have created several histograms, but the scaling of the axes is wrong. I want the numerical values of the x axis to appear on the y axis and the numerical values of the y axis to appear on the x axis. How does this work? I also want the axis values to be displayed in full rather than in scientific notation (e.g. 1e+05).
You don't need to combine your dataframe all over again. What you need is either a density plot or histogram.
Also as good practice, load only the packages required for plotting, in this case it would be maybe ggplot2 and tidyr.
For example, I just used an example with 5 of the column names I can see in your data:
library(tidyr)
library(ggplot2)
WKA_ohneJB = data.frame(dummyvar=1:10000,sapply(1:5,rnorm,n=10000))
colnames(WKA_ohneJB)[-1] = c("BASKETS_NZ","PIS","PIS_AP","PIS_DV","PIS_PL")
head(WKA_ohneJB)
dummyvar BASKETS_NZ PIS PIS_AP PIS_DV PIS_PL
1 1 0.92088518 0.9167877 1.956920 4.695379 4.349631
2 2 0.05335686 2.8225161 3.059749 4.317281 5.985579
3 3 1.00141759 3.5743033 2.499662 4.761415 5.886588
4 4 -1.31231486 2.5335004 5.396917 4.364643 5.866026
5 5 -0.65336724 0.2647117 3.203358 4.838659 4.437011
6 6 0.78769080 0.3630670 2.516433 3.826074 3.741611
To one of them do:
ggplot(WKA_ohneJB,aes(x=PIS)) + geom_histogram()
Or:
ggplot(WKA_ohneJB,aes(x=PIS)) + geom_density()
To plot everything at one go, you can try to pivot it long, as you have done with melt, but I don't know if your machine can handle it, so try it for a few variables first:
var_to_plot = c("BASKETS_NZ","PIS","PIS_AP","PIS_DV","PIS_PL")
dummyvar = "dummyvar"
ggplot(pivot_longer(WKA_ohneJB[,c(var_to_plot,dummyvar)],-dummyvar),
aes(x=value)) +
geom_histogram() +
facet_wrap(~name)
If melting the data.frame is too intensive, just use baseR plot:
# means 2 rows, 3 columns
par(mfrow=c(2,3))
for(i in var_to_plot){hist(WKA_ohneJB[,i],xlab=i,main="")}
events <- structure(list(ID = c(3049951, 3085397, 3204081, 3262134,
3467254), TVTProcedureStartDate = structure(c(16210, 16238, 16322,
16420, 16546), class = "Date"), DCDate = structure(c(16213, 16250,
16326, 16426, 16560), class = "Date"), CE_EventOccurred = c(0L,
0L, 0L, 0L, 0L), CE_EventDate = c(0L, 0L, 0L, 0L, 0L), `Annular Dissection (In Hospital)` = c(0L,
0L, 0L, 0L, 0L), `Aortic Dissection (In Hospital)` = c(0L, 0L,
0L, 1L, 0L), `Atrial Fibrillation (In Hospital)` = c(0L, 1L,
0L, 0L, 1L), `Bleeding at Access Site (In Hospital)` = c(0L,
0L, 0L, 0L, 0L), `Cardiac Arrest (In Hospital)` = c(1L, 0L, 0L,
0L, 0L), `Conduction/Native Pacer Disturbance Req ICD (In Hospital)` = c(0L,
0L, 1L, 0L, 0L), `Conduction/Native Pacer Disturbance Req Pacer (In Hospital)` = c(0L,
0L, 0L, 0L, 0L), `Endocarditis (In Hospital)` = c(0L, 0L, 0L,
0L, 0L), `GI Bleed (In Hospital)` = c(0L, 0L, 0L, 0L, 0L), `Hematoma at Access Site (In Hospital)` = c(0L,
0L, 0L, 0L, 0L), `Ischemic Stroke (In Hospital)` = c(0L, 0L,
0L, 0L, 0L), `Major Vascular Complications (In Hospital)` = c(0L,
0L, 0L, 0L, 0L), `Minor Vascular Complication (In Hospital)` = c(0L,
0L, 0L, 0L, 0L), `Mitral Leaflet Injury - detected during surgery (In Hospital)` = c(0L,
0L, 0L, 0L, 0L), `Mitral Subvalvular Injury -detected during surgery (In Hospital)` = c(0L,
0L, 0L, 0L, 0L), `New Requirement for Dialysis (In Hospital)` = c(0L,
0L, 0L, 0L, 0L), `Other Bleed (In Hospital)` = c(0L, 0L, 0L,
0L, 0L), `Perforation with or w/o Tamponade (In Hospital)` = c(1L,
0L, 0L, 0L, 0L), `Retroperitoneal Bleeding (In Hospital)` = c(0L,
0L, 0L, 0L, 0L), `Single Leaflet Device Attachment (In Hospital)` = c(0L,
0L, 0L, 0L, 0L), `Unplanned Other Cardiac Surgery or Intervention (In Hospital)` = c(0L,
0L, 0L, 0L, 0L), `Unplanned Vascular Surgery or Intervention (In Hospital)` = c(0L,
0L, 0L, 1L, 0L)), class = c("grouped_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -5L), vars = "NCDRPatientID", labels = structure(list(
NCDRPatientID = c(3049951, 3085397, 3204081, 3262134, 3467254
)), class = "data.frame", row.names = c(NA, -5L), vars = "NCDRPatientID", labels = structure(list(
NCDRPatientID = c(3049951, 3085397, 3204081, 3262134, 3467254,
3467324, 3510387, 3586037, 3661089, 3668621, 3679485, 3737916,
3738064, 3960141, 4006862, 4018241, 4019056, 4025174, 4027490,
4050900, 4051101, 4096816, 4097119, 4097146, 4097180, 4098426,
4106410, 4109968, 4147466, 4198427, 4198450, 4198458, 4204554,
4208053, 4213116, 4218802, 4218854, 4223378, 4223415, 4243959,
4316979, 4341660, 4348676, 4413567, 4419513, 4421948, 4422768,
4426483, 4430159, 4431211, 4433156, 4433406, 4433988)), class = "data.frame", row.names = c(NA,
-53L), vars = "NCDRPatientID", labels = structure(list(NCDRPatientID = c(3049951,
3085397, 3204081, 3262134, 3467254, 3467324, 3510387, 3586037,
3661089, 3668621, 3679485, 3737916, 3738064, 3960141, 4006862,
4018241, 4019056, 4025174, 4027490, 4050900, 4051101, 4096816,
4097119, 4097146, 4097180, 4098426, 4106410, 4109968, 4147466,
4198427, 4198450, 4198458, 4204554, 4208053, 4213116, 4218802,
4218854, 4223378, 4223415, 4243959, 4316979, 4341660, 4348676,
4413567, 4419513, 4421948, 4422768, 4426483, 4430159, 4431211,
4433156, 4433406, 4433988)), class = "data.frame", row.names = c(NA,
-53L), vars = "NCDRPatientID", drop = TRUE), indices = list(0L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10:12, 13L, 14L, 15L,
16:17, 18L, 19:21, 22L, 23L, 24L, 25:26, 27L, 28L, 29:30,
31L, 32:33, 34L, 35:38, 39L, 40:41, 42L, 43L, 44L, 45L, 46L,
47L, 48:50, 51:53, 54L, 55L, 56L, 57L, 58L, 59:60, 61L, 62L,
63:64, 65:66, 67:68, 69L, 70L, 71:72, 73L), drop = TRUE, group_sizes = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 2L, 1L, 3L,
1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 4L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L,
1L, 1L, 2L, 1L), biggest_group_size = 4L), indices = list(0L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L,
15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L,
27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L,
39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L,
51L, 52L), drop = TRUE, group_sizes = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), biggest_group_size = 1L), indices = list(0L, 1L, 2L, 3L, 4L), drop = TRUE, group_sizes = c(1L,
1L, 1L, 1L, 1L), biggest_group_size = 1L)
From this data, I need to create a column that has value 1 if any of the columns which ends in (in-hospital) contains 1 else 0.
I tried multiple things but either doesn't work or displays error
Error in mutate_impl(.data, dots) : Evaluation error: NA/NaN argument.
event %>% mutate(TR = rowSums(select_(.,6:n)))
Error in mutate_impl(.data, dots) : Column `TR` must be length 1 (the group size), not 53
event %>% mutate(TR = rowSums(.[6:ncol(.)]))
And some other variations of it to see if I can understand or make some sense, but it keeps running into the similar errors and problems
Another thing i tried was the following which seems to do the row sums, but it also adds the ID even when I'm doing the following:
event %>% select(6:27) %>% rowSums()
but it added the ID with the 1s and 0s from columns 6 to 27 for each row. Not sure why it's doing this.
I want the results as a data frame with the same data, but also a column with 1s if any of the columns from 6 to 27 contains 1 otherwise 0
Before I developed my solution, I ran the following code to ungroup your data.
library(dplyr)
events <- events %>% ungroup()
Solution 1: rowSums with selected columns
The idea of this solution is to use rowSums to add all the numbers from the selected columns, determine if the sum is larger than 0, and then convert the logical vector to an integer vector (with 1 or 0).
There are many ways to select the columns. We can select based on column numbers.
events2 <- events %>% mutate(Col = as.integer(rowSums(select(., 6:27)) > 0))
events2$Col
# [1] 1 1 1 1 1
We can use ends_with.
events2 <- events %>% mutate(Col = as.integer(rowSums(select(., ends_with("(In Hospital)"))) > 0))
events2$Col
# [1] 1 1 1 1 1
We can use matches. The regular expression \\(In Hospital\\)$ indicates the string at the end.
events2 <- events %>% mutate(Col = as.integer(rowSums(select(., matches("\\(In Hospital\\)$"))) > 0))
events2$Col
# [1] 1 1 1 1 1
We can use contains, but notice that the target string does not need to be in the end of the column names.
events2 <- events %>% mutate(Col = as.integer(rowSums(select(., contains("(In Hospital)"))) > 0))
events2$Col
# [1] 1 1 1 1 1
Solution 2: apply with max
Since the numbers in the target columns are all 1 or 0, we can use apply with max to get the row maximum, which will be 1 if there are any 1s and 0 otherwise. All the ways of using the select function shown above will also work here. Below I present one way to do this.
events2 <- events %>% mutate(Col = apply(select(., ends_with("(In Hospital)")), 1, max))
events2$Col
# [1] 1 1 1 1 1
It is not a dplyr way, but it also works:
events$new_col <- 0
events$new_col[rowSums(events[, grep("In Hospital", colnames(events))]) >= 1] <- 1
A solution from base R using apply()
# Positions of every column whose name contains "in hospital", in any
# letter case.
cols <- grep("in hospital", colnames(events), ignore.case = TRUE)
# drop = FALSE keeps the selection two-dimensional even when only one column
# matches; a plain data.frame would otherwise collapse to a vector and
# apply() would fail. Each row yields 1 if any selected value equals 1,
# otherwise 0.
apply(events[, cols, drop = FALSE], 1, function(x) as.numeric(any(x == 1)))
# [1] 1 1 1 1 1
I have a data frame
structure(list(id = c(31068L, 4741L, 31068L, 48783L, 48783L,
4741L, 40866L, 602L, 43498L, 602L, 47365L, 27643L, 30657L, 47365L,
2456L, 30657L, 48783L, 40866L, 31068L, 31068L), chrom = structure(c(10L,
2L, 10L, 18L, 18L, 2L, 1L, 12L, 15L, 12L, 3L, 14L, 13L, 3L, 5L,
13L, 18L, 1L, 10L, 10L), .Label = c("chr1", "chr10", "chr11",
"chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18",
"chr19", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8",
"chr9", "chrM", "chrUn", "chrX", "chrY"), class = "factor"),
strand = structure(c(2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L), .Label = c("-",
"+"), class = "factor"), position = c(32233920L, 63320175L,
32233915L, 105252695L, 105252690L, 63320180L, 160222550L,
60750690L, 111386910L, 60750685L, 89019895L, 155820080L,
146439265L, 89019890L, 56738465L, 146439260L, 105252700L,
160222545L, 32233910L, 32233925L), state = c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), probability = c(0.0609068958401659, 0.0652616906258002,
0.0692068771991058, 0.0692812165331229, 0.0701354955671266,
0.0703768347393804, 0.0711103024384338, 0.0741845396230543,
0.0743938664894217, 0.0749688062517259, 0.079612884618025,
0.0813520378059492, 0.0826542716746912, 0.0846949902404464,
0.0861396527789957, 0.0864410597153605, 0.0871058913414357,
0.087430325878555, 0.0878512688974869, 0.0880022007028806
), differential = c(-0.23435362433765, -0.23435362433765,
-0.171785943061141, 0.196483908128415, 0.251738193935959,
-0.23435362433765, -0.23435362433765, -0.278594311118605,
0.242648431724363, -0.278594311118605, -0.278594311118605,
0.209140348612477, -0.28152682524624, -0.234047089198976,
0.290374904597332, -0.242344380735883, 0.196483908128415,
-0.23435362433765, -0.171785943061141, -0.155502764975432
), tag1 = c(0L, 0L, 0L, 15L, 15L, 0L, 0L, 5L, 30L, 5L, 5L,
10L, 2L, 3L, 15L, 7L, 15L, 0L, 0L, 0L), mut1 = c(0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), tag2 = c(15L, 15L, 11L, 5L, 2L, 15L, 15L, 25L,
15L, 25L, 25L, 0L, 21L, 19L, 0L, 25L, 5L, 15L, 11L, 10L),
mut2 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), group = c(1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)), .Names = c("id",
"chrom", "strand", "position", "state", "probability", "differential",
"tag1", "mut1", "tag2", "mut2", "group"), class = c("data.table",
"data.frame"), row.names = c(NA, -20L), .internal.selfref = <pointer: 0x102819778>)
and I would like to create a new column df$bin such that the bin column creates bins of length n of consecutive values.
For example, if n = 2, I would want there to be 2 bins where the bin column looks like bin = c(1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2)
such that the values in for df$bin would be 1:n and each value would be repeated nrow(df)/n times.
You can use the base R function cut for this:
# Number of bins; cut() needs at least 2 when `breaks` is a single number.
n <- 2L
# seq_len() is used instead of 1:nrow(df) so an empty table gets an empty
# column rather than the two values c(1, 0). labels = FALSE returns the bin
# number (1..n) instead of an interval label.
df$bin <- if (nrow(df) > 0) {
  cut(seq_len(nrow(df)), breaks = n, labels = FALSE)
} else {
  integer(0)
}
I have a dataframe with:
"serial" the number of households, each one with a variable number of components "head, spouse, parent and child or grandchild" and total number of children in the house "nchild"
I want to create a new variable (in the dput I added an example for clarity: withCM 'living with male child' and withCF). I have tried various combinations but I cannot discriminate on the sex of the child within the same "serial", so that for withCM=1 only when relate=="child"&sex==1, but the 1 would appear on a different row (that of the head, spouse or parent)
mydata$withCM<- ifelse(mydata$nchild>0&mydata$relate!="child",1,0)
mydata <- structure(list(serial = c(12345L, 12345L, 12345L, 12345L, 12346L,
12346L, 12347L, 12347L, 12347L, 12348L, 12348L, 12348L, 12348L,
12348L, 12348L, 12348L, 12349L, 12350L, 12350L, 12351L, 12351L,
12351L, 12352L, 12352L, 12352L, 12352L, 12352L, 12353L, 12354L,
12354L), age = c(45L, 44L, 13L, 11L, 29L, 28L, 65L, 61L, 35L,
68L, 61L, 35L, 34L, 6L, 2L, 1L, 62L, 54L, 52L, 67L, 67L, 12L,
49L, 50L, 28L, 21L, 22L, 70L, 89L, 55L), sex = c(1L, 2L, 2L,
1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L,
1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L), relate = structure(c(4L,
7L, 1L, 1L, 4L, 7L, 6L, 6L, 4L, 4L, 7L, 1L, 2L, 3L, 3L, 3L, 4L,
4L, 7L, 4L, 7L, 3L, 4L, 7L, 1L, 5L, 5L, 4L, 6L, 4L), .Label = c("child",
"childinlaw", "grandchild", "head", "nonrelative", "parent",
"spouse"), class = "factor"), nchild = c(2L, 2L, 0L, 0L, 0L,
0L, 1L, 1L, 0L, 1L, 1L, 3L, 3L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 0L), conhija = c(1L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L), conhijo = c(1L,
1L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L)), .Names = c("serial",
"age", "sex", "relate", "nchild", "conhija", "conhijo"), class = "data.frame", row.names = c(NA,
-30L))
You can tabulate the gender, family, and role-within-family as:
xtab <- table(mydata$serial, mydata$sex, mydata$relate)
And then choose the heads of the families (or, in the commented line, anyone who has the specific relationship), and alter their tallies as follows:
# Per-person counts of children in the same household, split by sex code.
# Everyone starts at 0; only the rows selected by `ind` are filled in below.
mydata$sex1 <- 0
mydata$sex2 <- 0
# Rows that receive the tallies: household heads only.
ind <- mydata$relate=="head"
# Alternative selection: every head, spouse and parent instead of heads only.
#ind <- mydata$relate %in% c("head","spouse","parent")
# xtab is indexed by name as [serial, sex, relate], so the serial number is
# converted to character to look it up by dimension name rather than position.
# sex1: number of household members with relate "child" and sex code 1.
mydata$sex1[ind] <- xtab[as.character(mydata$serial[ind]), "1", "child"]
# sex2: number of household members with relate "child" and sex code 2.
mydata$sex2[ind] <- xtab[as.character(mydata$serial[ind]), "2", "child"]
Use lapply to split into families, then test if they are an adult, and there is at least one male child in the unit.
# For one household, flag the adults who live with at least one male child.
#
# serial: household identifier to look up in data$serial.
# data:   data frame with columns serial, relate and sex; defaults to the
#         global `mydata`, so existing one-argument calls keep working.
#
# Returns a logical vector with one element per row of that household, in
# row order: TRUE for a head, spouse or parent when the household contains
# a row with relate "child" and sex code 1, otherwise FALSE.
lives_with_boy <- function(serial, data = mydata) {
  # which() skips rows whose serial is NA instead of returning all-NA rows.
  unit <- data[which(data$serial == serial), , drop = FALSE]
  is_adult <- as.character(unit$relate) %in% c("head", "spouse", "parent")
  # na.rm = TRUE: a missing sex or relate on one member must not turn the
  # whole household's answer into NA.
  has_boy <- any(unit$relate == "child" & unit$sex == 1, na.rm = TRUE)
  is_adult & has_boy
}
# Write each household's flags back into that household's own rows.
# Concatenating the per-household results with unlist() only lines up with
# the data when every household's rows are stored next to each other, so the
# values are assigned by row position instead.
mydata$withCM <- rep(NA, nrow(mydata))
for (family_id in unique(mydata$serial[!is.na(mydata$serial)])) {
  family_rows <- which(mydata$serial == family_id)
  mydata$withCM[family_rows] <- lives_with_boy(family_id)
}