Merge 2 resulting vectors into 1 data frame using R - r

I have a df like this
# Example data: one row per (Month, Category, Year) observation.
Month <- rep(c("JAN", "FEB", "MAR", "APR", "MAY"), times = c(4, 2, 1, 1, 2))
Category <- c("A", "A", "B", "C", "A", "E", "B", "D", "E", "F")
Year <- c(2014, 2015, 2015, 2015, 2014, 2013, 2015, 2014, 2015, 2013)
Number_Combinations <- c(3, 2, 3, 4, 1, 3, 6, 5, 1, 1)
df <- data.frame(
  Month = Month,
  Category = Category,
  Year = Year,
  Number_Combinations = Number_Combinations
)
df
Month Category Year Number_Combinations
1 JAN A 2014 3
2 JAN A 2015 2
3 JAN B 2015 3
4 JAN C 2015 4
5 FEB A 2014 1
6 FEB E 2013 3
7 MAR B 2015 6
8 APR D 2014 5
9 MAY E 2015 1
10 MAY F 2013 1
I have another df that I got from the above dataframe with a condition
# Keep only the rows of df with more than 2 combinations. which() discards
# rows where the comparison is NA, which is also what subset() does.
df1 <- df[which(df$Number_Combinations > 2), , drop = FALSE]
df1
Month Category Year Number_Combinations
1 JAN A 2014 3
3 JAN B 2015 3
4 JAN C 2015 4
6 FEB E 2013 3
7 MAR B 2015 6
8 APR D 2014 5
Now I want to create a table reporting the month, the total number of rows for the month in df and the total number of rows for the month in df1
Desired Output would be
Month Number_Month_df Number_Month_df1
1 JAN 4 3
2 FEB 2 1
3 MAR 1 1
4 APR 1 1
5 MAY 2 0
I used table(df) and table(df1) and tried merging the results, but I am not getting the desired result. Could someone please help me get the above data frame?

We get the table of the 'Month' column from both 'df' and 'df1', convert to 'data.frame' (as.data.frame), merge by the 'Var1', and change the column names accordingly.
# Count rows per month in each data frame, then left-join on the month key.
# all.x = TRUE keeps months that do not occur in df1 (e.g. MAY): without it
# merge() performs an inner join and drops them whenever 'Month' is a
# character column (the data.frame() default since R 4.0).
res <- merge(as.data.frame(table(df$Month)),
             as.data.frame(table(df1$Month)),
             by = "Var1", all.x = TRUE)
colnames(res) <- c("Month", "Number_Month_df", "Number_Month_df1")
# Months missing from df1 come back as NA after the left join; report them as 0.
res$Number_Month_df1[is.na(res$Number_Month_df1)] <- 0L
# merge() sorts by the key; restore the order in which months appear in df.
res <- res[match(unique(df$Month), res$Month), ]
rownames(res) <- NULL

# Tabulate both data frames against one shared set of month levels so that
# (a) the two count columns are aligned row by row, and (b) months that do not
# occur in df1 are reported as 0. Sorting each table independently would pair
# counts with the wrong months.
month_levels <- unique(as.character(df$Month))
res <- data.frame(
  Month = month_levels,
  Number_Month_df = as.vector(table(factor(df$Month, levels = month_levels))),
  Number_Month_df1 = as.vector(table(factor(df1$Month, levels = month_levels)))
)

Related

Create incremental column year based on id and year column in R

I have the dataframe below and I want to create the 'create_col' column, probably with some kind of seq() function, using the 'year' column as the start of the sequence. How could I do that?
# Rows per id: id 1 has 2 rows, id 2 has 1, id 3 has 3, id 4 has 1.
rows_per_id <- c(2, 1, 3, 1)
id <- rep(c(1, 2, 3, 4), times = rows_per_id)
year <- rep(c(2013, 2015, 2017, 2011), times = rows_per_id)
# Expected result: within each id, count up from that id's year.
create_col <- c(2013, 2014, 2015, 2017, 2018, 2019, 2011)
Ideal result:
id year create_col
1 1 2013 2013
2 1 2013 2014
3 2 2015 2015
4 3 2017 2017
5 3 2017 2018
6 3 2017 2019
7 4 2011 2011
You can add row_number() to minimum year in each id :
library(dplyr)
# Within each id, start at that id's earliest year and add one per row.
# ungroup() at the end drops the grouping so that later verbs applied to the
# result do not silently operate per id.
df %>%
  group_by(id) %>%
  mutate(create_col = min(year) + row_number() - 1) %>%
  ungroup()
# id year create_col
# <dbl> <dbl> <dbl>
#1 1 2013 2013
#2 1 2013 2014
#3 2 2015 2015
#4 3 2017 2017
#5 3 2017 2018
#6 3 2017 2019
#7 4 2011 2011
data
# Input data for the answer above, built from the question's id and year vectors.
df <- data.frame(id = id, year = year)

Combine data in many row into a columnn

I have a data like this:
year Male
1 2011 8
2 2011 1
3 2011 4
4 2012 3
5 2012 12
6 2012 9
7 2013 4
8 2013 3
9 2013 3
and I need to group the data for the year 2011 in one column, 2012 in the next column and so on.
2011 2012 2013
1 8 3 4
2 1 12 3
3 4 9 3
How do I achieve this?
One option is unstack if the number of rows per 'year' is the same
# Split the 'Male' values by 'year' and lay the groups out side by side,
# one column per year.
unstack(df1, Male ~ year)
One option is to use functions from dplyr and tidyr.
library(dplyr)
library(tidyr)
# Number the rows within each year, then widen so that the n-th observation of
# every year lands on the n-th row. The grouping is dropped before reshaping,
# and pivot_wider() replaces the superseded spread().
dt2 <- dt %>%
  group_by(year) %>%
  mutate(ID = row_number()) %>%
  ungroup() %>%
  pivot_wider(names_from = year, values_from = Male) %>%
  select(-ID)
1
If every year has the same number of data, you could split the data and cbind it using base R
# split() yields one vector of 'Male' values per year; cbind() binds those
# vectors as the columns of a matrix, named after the years.
do.call(cbind, split(df$Male, df$year))
# 2011 2012 2013
#[1,] 8 3 4
#[2,] 1 12 3
#[3,] 4 9 3
2
If every year does not have the same number of data, you could use rbind.fill of plyr
df[10, ] <- c(2015, 5) # Add only one data point for the year 2015
library(plyr)
# Split once and reuse the result: the columns are produced in the order of
# split()'s groups (sorted years), so the names must come from that same
# object. unique(df$year) is in order of appearance and would mislabel the
# columns whenever the data is not already sorted by year.
male_by_year <- split(df$Male, df$year)
setNames(object = data.frame(t(rbind.fill.matrix(lapply(male_by_year, t)))),
         nm = names(male_by_year))
# 2011 2012 2013 2015
#1 8 3 4 5
#2 1 12 3 NA
#3 4 9 3 NA
3
Yet another way is to use dcast to convert data from long to wide format
# Append a single 2015 observation so the years have unequal row counts.
df[10,] = c(2015, 5) #Add only one data point for the year 2015
library(reshape2)
# ave(..., FUN = seq_along) numbers the rows within each year; casting that
# index against 'year' puts the n-th observation of every year on row n.
# The trailing [,-1] drops the first column of the result, which holds that index.
dcast(df, ave(df$Male, df$year, FUN = seq_along) ~ year, value.var = "Male")[,-1]
# 2011 2012 2013 2015
#1 8 3 4 5
#2 1 12 3 NA
#3 4 9 3 NA

duplicating/replicating only specific rows in a data frame

I have data organized by unique id and sorted by date of visit. Some people have multiple visits. The data is in long format, sorted by visit. I only want to replicate the row of the last visit of each person. How does one replicate only specific rows in a data frame?
id visit glucose
1 12 Jan 2015 12
1 3 Feb 2015 8
2 1 Feb 2015 13
3 12 Jan 2015 7
3 4 Feb 2015 13
3 1 March 2015 8
If we need to duplicate the last row based on the 'visit' for each 'id', we can use data.table. Convert the 'data.frame' to 'data.table' (setDT(df1)), order by 'id', and 'visit', grouped by 'id', we replicate the last row (.N)
library(data.table)
# setDT() converts df1 to a data.table. The i-expression orders the rows by id
# and by 'visit' parsed as a date with the "%d %b %Y" format; then, within each
# id, c(seq_len(.N), .N) selects every row plus the last row one more time.
setDT(df1)[order(id, as.Date(visit, "%d %b %Y")), .SD[c(seq_len(.N), .N)], by = id]
# id visit glucose
#1: 1 12 Jan 2015 12
#2: 1 3 Feb 2015 8
#3: 1 3 Feb 2015 8
#4: 2 1 Feb 2015 13
#5: 2 1 Feb 2015 13
#6: 3 12 Jan 2015 7
#7: 3 4 Feb 2015 13
#8: 3 1 March 2015 8
#9: 3 1 March 2015 8
If we want only the last row for each 'id'
# Same ordering as above, but .SD[.N] keeps only the final row of each id.
setDT(df1)[order(id, as.Date(visit, "%d %b %Y")), .SD[.N], id]

Create groups based on time period

How can I create a new grouping variable for my data based on 5-year steps?
So from this:
# Example data: group A covers 2008-2014 (7 rows), group B covers 2005-2014
# (10 rows).
group <- rep(c("A", "B"), times = c(7, 10))
year <- c(2008:2014, 2005:2014)
dat <- data.frame(group = group, year = year)
group year
1 A 2008
2 A 2009
3 A 2010
4 A 2011
5 A 2012
6 A 2013
7 A 2014
8 B 2005
9 B 2006
10 B 2007
11 B 2008
12 B 2009
13 B 2010
14 B 2011
15 B 2012
16 B 2013
17 B 2014
To this:
> dat
group year period
1 A 2008 2005_2009
2 A 2009 2005_2009
3 A 2010 2010_2014
4 A 2011 2010_2014
5 A 2012 2010_2014
6 A 2013 2010_2014
7 A 2014 2010_2014
8 B 2005 2005_2009
9 B 2006 2005_2009
10 B 2007 2005_2009
11 B 2008 2005_2009
12 B 2009 2005_2009
13 B 2010 2010_2014
14 B 2011 2010_2014
15 B 2012 2010_2014
16 B 2013 2010_2014
17 B 2014 2010_2014
I guess I could use cut(dat$year, breaks = ??) but I don't know how to set the breaks.
Here is one way of doing it:
# First year of the 5-year bucket each row falls into: the largest multiple of
# 5 that is not greater than the year. Computed as its own variable instead of
# assigning to `min` inside the paste() call, which shadowed base::min and
# relied on the order in which the arguments are evaluated.
period_start <- floor(dat$year / 5) * 5
dat$period <- paste(period_start, period_start + 4, sep = "_")
I guess the trick here is to get the biggest whole number smaller than your year with the floor(year/x)*x function.
Here is a version that should work generally:
x <- 5            # width of each period, in years
yearstart <- 2000 # a year on which a period is known to begin
# First year of each row's period, counted in steps of x from yearstart.
# Kept in its own variable instead of assigning to `min` inside the paste()
# call, which shadowed base::min and relied on argument evaluation order.
period_start <- floor((dat$year - yearstart) / x) * x + yearstart
dat$period <- paste(period_start, period_start + x - 1, sep = "_")
You can use yearstart to ensure e.g. year 2000 is the first in a group for when x is not a multiple of it.
cut should do the job if you create actual Date objects from your 'year' column.
library(lubridate)

## represent every year as January 1st of that year
yrs <- as.Date(paste0(dat$year, "-01-01"))

## bin the dates into consecutive 5-year intervals; cut() begins the first
## interval at the earliest year present in the data (2005 here)
dat$period <- cut(yrs, "5 years")

## relabel each level from its start date to "<start year>_<start year + 4>"
lvl <- year(as.Date(levels(dat$period)))
lvl <- paste(lvl, lvl + 4, sep = "_")
levels(dat$period) <- lvl
head(dat)
group year period
1 A 2008 2005_2009
2 A 2009 2005_2009
3 A 2010 2010_2014
4 A 2011 2010_2014
5 A 2012 2010_2014
6 A 2013 2010_2014

data standardization for all group data.frame in R

I have a dataset as below
# Example data: groups a and b have both months, group c has only Jan.
Date <- rep(c("Jan", "Feb"), length.out = 5)
Group <- rep(letters[1:3], times = c(2, 2, 1))
# Random values; no seed is set, so they differ from run to run.
value <- sample(1:10, 5)
data <- data.frame(Date = Date, Group = Group, value = value)
> data
Date Group value
1 Jan a 2
2 Feb a 7
3 Jan b 3
4 Feb b 9
5 Jan c 1
As you can observe, group c does not have data for Date=Feb.
How can I make a dataset such that
> DATA
Date Group value
1 Jan a 2
2 Feb a 7
3 Jan b 3
4 Feb b 9
5 Jan c 1
6 Feb c 0
I have added last row such that value for group c in feb is 0.
Thanks
With base R you can use xtabs wrapped in as.data.frame:
# xtabs() sums 'value' over every Date x Group cell, giving 0 for cells with no
# rows; as.data.frame() turns the table back into long form, with the sums in
# a column named 'Freq'.
as.data.frame(xtabs(formula = value ~ Date + Group, data = data))
# Date Group Freq
#1 Feb a 8
#2 Jan a 6
#3 Feb b 4
#4 Jan b 1
#5 Feb c 0
#6 Jan c 10
Using merge:
# every Date x Group combination, whether or not it occurs in the data
all.comb <- expand.grid(Date = unique(data$Date), Group = unique(data$Group))
# left join onto the full grid: combinations absent from 'data' get value NA
res <- merge(all.comb, data, all.x = TRUE)
# absent combinations are reported as 0 instead of NA
res$value <- replace(res$value, is.na(res$value), 0)
# result
res
# Date Group value
# 1 Feb a 3
# 2 Feb b 4
# 3 Feb c 0
# 4 Jan a 5
# 5 Jan b 7
# 6 Jan c 10
Using reshape2
library(reshape2)
# Cast to wide (one column per Group, absent cells filled with 0), then melt
# back to long so that every Date/Group pair has a row. The argument is spelled
# out as id.vars instead of relying on partial matching of `id.var`.
# Values differ from the question as there was no set.seed().
melt(dcast(data, Date ~ Group, value.var = "value", fill = 0),
     id.vars = "Date")
# Date variable value
#1 Feb a 1
#2 Jan a 10
#3 Feb b 7
#4 Jan b 4
#5 Feb c 0
#6 Jan c 5
Or using dplyr
library(dplyr)
library(tidyr)
# Spread to wide (absent Date/Group cells filled with 0), then gather back to
# long. Gathering every column except Date, instead of the hard-coded range
# a:c, keeps this working whatever the group names are.
data %>%
  spread(Group, value, fill = 0) %>%
  gather(Group, value, -Date)
# Date Group value
#1 Feb a 1
#2 Jan a 10
#3 Feb b 7
#4 Jan b 4
#5 Feb c 0
#6 Jan c 5

Resources