Related
I have a data set that has id, date and time columns, and I want to calculate the difference in time between each available date, grouped by id. I have looked for a similar problem on Stack Overflow but so far no luck. I have tried a few different approaches, but none of them has worked so far. Any help would be great.
data set:
> dput(mydata)
structure(list(id = c("a", "a", "b", "b", "b", "c"), date = c("2018-04-13",
"2011-11-12", "2019-05-30", "2014-09-13", "2019-06-21", "1998-01-08"
), time = c("50", "40", "30", "20", "10", "30")), class = "data.frame", row.names = c(NA,
-6L))
Desire output:
id date time time_diff
a 2018-04-13 50 10
a 2011-11-12 40 NA/0
b 2019-05-30 30 10
b 2014-09-13 20 NA/0
b 2019-06-21 10 -20
c 1998-01-08 30 NA/0
I understand that the earliest date within each id has nothing to be compared against, so its difference can be either NA or 0 in this case.
Here is the code that I have tried, but it gives an error:
# Attempt from the question; it does not produce the desired time_diff column.
# Grouping by both id and date leaves one row per group in the sample data,
# and `time` is still a character vector at this point.
mydata <- mydata %>%
group_by(id,date) %>%
# diff() returns one element fewer than its input, so its result cannot
# fill a column that must have one value per row of the group.
mutate(time_diff = diff(time))
library(dplyr)

# Sample data from the question. Each column is built with c(); a bare
# parenthesised list of strings is not valid R.
df <- structure(
  list(
    id = c("a", "a", "b", "b", "b", "c"),
    date = c(
      "2018-04-13", "2011-11-12", "2019-05-30",
      "2014-09-13", "2019-06-21", "1998-01-08"
    ),
    time = c("50", "40", "30", "20", "10", "30")
  ),
  class = "data.frame", row.names = c(NA, -6L)
)

# Difference between each time and the previous one (in date order) within
# an id. The earliest date of every id has no predecessor, so it gets NA.
df %>%
  mutate(
    date = as.Date(date),     # sort as real dates, not as text
    time = as.numeric(time)   # stored as character in the sample data
  ) %>%
  arrange(id, date) %>%
  group_by(id) %>%
  mutate(time_diff = time - lag(time)) %>%
  ungroup()                   # do not leak the grouping to later steps
For each id you may subtract the time corresponding to minimum date.
library(dplyr)

# Per id, express every time relative to the time recorded on that id's
# earliest date; the baseline row itself therefore comes out as 0.
mydata %>%
  mutate(
    date = as.Date(date),
    time = as.numeric(time)
  ) %>%
  group_by(id) %>%
  mutate(time_diff = time - time[which.min(date)]) %>%
  ungroup()
# id date time time_diff
# <chr> <date> <dbl> <dbl>
#1 a 2018-04-13 50 10
#2 a 2011-11-12 40 0
#3 b 2019-05-30 30 10
#4 b 2014-09-13 20 0
#5 b 2019-06-21 10 -10
#6 c 1998-01-08 30 0
We can use data.table
library(data.table)

# Turn the character `time` column into a number; `date` stays a string.
mydata <- type.convert(mydata, as.is = TRUE)
setDT(mydata)

# Per id, subtract the time recorded on the earliest date. which.min() picks
# exactly one baseline row, so an id whose earliest date occurs more than
# once still gets a single baseline value instead of a recycled vector.
# The date is parsed only for the comparison; the column itself is unchanged.
mydata[, time_diff := time - time[which.min(as.IDate(date))], by = id]
mydata
id date time time_diff
1: a 2018-04-13 50 10
2: a 2011-11-12 40 0
3: b 2019-05-30 30 10
4: b 2014-09-13 20 0
5: b 2019-06-21 10 -10
6: c 1998-01-08 30 0
I have two dataframes with two columns each, that I would like to plot together as a barplot using ggplot in R as shown below:
How can I do this using dplyr in R?
Sample Data:
DF1
Code Count_2020
A 1
B 2
C 3
D 4
E 5
F 6
DF2
Code Count_2021
A 4
B 8
C 6
D 8
E 10
F 12
So, I first thought of merging the two dataframes into one using dplyr::inner_join, and I got a new dataframe as shown below:
Code Count_2021 Count_2020
A 4 1
B 8 2
C 6 3
D 8 4
E 10 5
F 12 6
Next I thought of using tidyr::gather to reshape the count data from both years into Type and Value columns for plotting, but this messed up the gathered dataframe, as the output changed to:
Type Value
Code A
Code B
Code C
Code D
Code E
Code F
Code I tried
library(tidyverse)
# Merge DF1 and DF2 (no `by` is given, so the join key is whatever columns
# the two data frames have in common)
DF = inner_join(DF1, DF2)
# Gather data for plotting
# NOTE: gather() is called here without excluding `Code`, so `Code` is
# gathered into Type/Value together with the two count columns.
Gathered_DF= DF%>% dplyr::select(Code, Count_2020, Count_2021) %>%
gather(key = Type, value = Value) # Output not as expected, stuck!!
We can reshape to 'long' format with pivot_longer after the join and then use geom_col in ggplot2 with position specified as 'dodge' and fill as 'Year'
library(dplyr)
library(tidyr)
library(ggplot2)

# Join on the shared key explicitly, reshape the two Count_* columns to long
# format (Year holds the suffix after "Count_", `value` holds the count),
# then draw one bar per Year side by side within each Code.
inner_join(DF1, DF2, by = "Code") %>%
  pivot_longer(cols = -Code, names_to = "Year", names_prefix = "Count_") %>%
  ggplot(aes(x = Code, y = value, fill = Year)) +
  geom_col(position = "dodge") +
  theme_bw()
-output
data
DF1 <- structure(list(Code = c("A", "B", "C", "D", "E", "F"),
Count_2020 = 1:6), class = "data.frame", row.names = c(NA,
-6L))
DF2 <- structure(list(Code = c("A", "B", "C", "D", "E", "F"), Count_2021 = c(4L,
8L, 6L, 8L, 10L, 12L)), class = "data.frame", row.names = c(NA,
-6L))
You can use pivot_longer instead of gather, since gather has been superseded by pivot_longer in current versions of tidyr.
library(tidyverse)

# Sample counts per code, one data frame per year.
df1 <- data.frame(
  Code = c("A", "B", "C", "D", "E", "F"),
  Count_2020 = c(1, 2, 3, 4, 5, 6)
)
df2 <- data.frame(
  Code = c("A", "B", "C", "D", "E", "F"),
  Count_2021 = c(4, 8, 6, 8, 10, 12)
)

# Match rows on Code, then stack the Count_* columns into Year/Count pairs
# (the "Count_" prefix is stripped from the Year values).
df_joined <- pivot_longer(
  inner_join(df1, df2, by = "Code"),
  cols = !Code,
  names_to = "Year",
  names_prefix = "Count_",
  values_to = "Count"
)
df_joined
#> # A tibble: 12 x 3
#> Code Year Count
#> <fct> <chr> <dbl>
#> 1 A 2020 1
#> 2 A 2021 4
#> 3 B 2020 2
#> 4 B 2021 8
#> 5 C 2020 3
#> 6 C 2021 6
#> 7 D 2020 4
#> 8 D 2021 8
#> 9 E 2020 5
#> 10 E 2021 10
#> 11 F 2020 6
#> 12 F 2021 12
# One bar per Year, side by side within each Code; bar height is Count.
ggplot(data = df_joined, mapping = aes(x = Code, y = Count, fill = Year)) +
  geom_col(position = "dodge")
In the code above, the arguments passed to pivot_longer are:
cols = !Code: the columns to be pivoted, i.e. every column except Code
names_to = "Year": the name of the new column that holds the former column names (used for grouping)
names_prefix = "Count_": removes the string "Count_" from the values of the new "Year" column
values_to = "Count": the name of the new column that stores the values from each group.
You can learn more about this function by simply calling ?pivot_longer
Use pivot_longer() to reshape your data then plot using ggplot.
Bonus: to add text on the bars, use geom_bar_text from the ggfittext package
library(tidyverse)

# Sample data, parsed from inline text (first line is the header).
DF1 <- read.table(header = TRUE, text = "Code Count_2020
A 1
B 2
C 3
D 4
E 5
F 6")
DF2 <- read.table(header = TRUE, text = "Code Count_2021
A 4
B 8
C 6
D 8
E 10
F 12")

# Keep every DF1 row and attach the DF2 count with the same Code.
DF <- left_join(DF1, DF2, by = "Code")

# Split each count column name at the underscore into a throwaway prefix
# ("tmp") and the Year, stack the values into Count, then drop the prefix.
DF_long <- pivot_longer(
  DF,
  cols = -Code,
  names_to = c("tmp", "Year"),
  names_sep = "\\_",
  values_to = "Count"
)
DF_long <- select(DF_long, -tmp)
DF_long
#> # A tibble: 12 x 3
#> Code Year Count
#> <chr> <chr> <int>
#> 1 A 2020 1
#> 2 A 2021 4
#> 3 B 2020 2
#> 4 B 2021 8
#> 5 C 2020 3
#> 6 C 2021 6
#> 7 D 2020 4
#> 8 D 2021 8
#> 9 E 2020 5
#> 10 E 2021 10
#> 11 F 2020 6
#> 12 F 2021 12
# Side-by-side bars per Code, filled by Year.
plt <- ggplot(
  data = DF_long,
  mapping = aes(x = Code, y = Count, fill = Year)
) +
  geom_col(position = position_dodge(width = 0.9)) +
  theme_minimal()
plt

# Add text on the bars with ggfittext, dodged to line up with the bars.
library(ggfittext)
plt + geom_bar_text(position = "dodge", reflow = TRUE)
Created on 2021-08-05 by the reprex package (v2.0.1)
I also found another way of doing it:
library(tidyverse)

# Sample counts per offense code, one data frame per year.
DF1 <- data.frame(
  Code = c("A", "B", "C", "D", "E", "F"),
  Count_2020 = c(1, 2, 3, 4, 5, 6)
)
DF2 <- data.frame(
  Code = c("A", "B", "C", "D", "E", "F"),
  Count_2021 = c(4, 8, 6, 8, 10, 12)
)

# Name the join key explicitly instead of relying on the implicit
# common-column match.
DF_Merged <- inner_join(DF1, DF2, by = "Code")

# Long format: one row per Code/year. pivot_longer() replaces the superseded
# gather(), and names_prefix strips "Count_" so Type is the bare year
# ("2020"/"2021") without a hand-written ifelse().
DFF_Merged <- DF_Merged %>%
  dplyr::select(Code, Count_2020, Count_2021) %>%
  pivot_longer(
    cols = -Code,
    names_to = "Type",
    names_prefix = "Count_",
    values_to = "Value"
  ) %>%
  arrange(Type) # all 2020 rows first, then 2021, as gather() ordered them

# Dodged bars per offense code, ordered by Value via reorder().
DFF_Merged %>%
  ggplot(aes(
    x = reorder(Code, Value), y = Value, fill = Type,
    # NOTE(review): `text` is not drawn by any layer here; presumably it
    # feeds an interactive tooltip downstream - confirm.
    text = paste(
      "Count:", Value,
      "<br>", "Offense Code:", Code,
      "<br>", "Year:", Type
    )
  )) +
  geom_col(position = "dodge", show.legend = FALSE) +
  xlab("Offense Code") +
  ylab("Count") +
  ggtitle("Arrest Counts for Group 1 in Year 2020 and 2021") +
  theme(axis.text = element_text(size = 8))
Result
I have a data frame that contains the monetary transactions among individuals. The transactions can be two-way, i.e. A can transfer money to B and B can also transfer money to A. The structure of the data frame looks like below:
From To Amount
A B $100
A C $40
A D $30
B A $25
B C $70
C A $190
C D $110
I want to summarize the total amount of transactions among each pair of individuals who have transactions with each other and the results should be something like:
Individual_1 Individual_2 Sum
A B $125
A C $230
A D $30
B C $70
C D $110
I tried to utilize the grouping feature of the package dplyr but I think it does not apply to my case.
You can use pmin/pmax to sort From and To columns and sum the Amount value.
library(dplyr)

# pmin()/pmax() put each From/To pair in sorted order, so A->B and B->A fall
# into the same group. Amount is parsed from "$100"-style text before summing.
df %>%
  group_by(
    col1 = pmin(From, To),
    col2 = pmax(From, To)
  ) %>%
  summarise(
    Amount = sum(readr::parse_number(Amount)),
    .groups = "drop" # return an ungrouped result, with no regrouping message
  )
# col1 col2 Amount
# <chr> <chr> <dbl>
#1 A B 125
#2 A C 230
#3 A D 30
#4 B C 70
#5 C D 110
Using the same logic in base R you can do :
# Base R version: add the sorted pair (col1, col2) and a numeric Amount with
# transform(), then sum Amount within each pair.
aggregate(
  Amount ~ col1 + col2,
  data = transform(
    df,
    col1 = pmin(From, To),
    col2 = pmax(From, To),
    Amount = as.numeric(sub('$', '', Amount, fixed = TRUE))
  ),
  FUN = sum
)
data
df <- structure(list(From = c("A", "A", "A", "B", "B", "C", "C"), To = c("B",
"C", "D", "A", "C", "A", "D"), Amount = c("$100", "$40", "$30",
"$25", "$70", "$190", "$110")), class = "data.frame", row.names = c(NA, -7L))
A solution using the tidyverse package. You need to find a way to create a common grouping column with the right order of the individuals. dat2 is the final output.
library(tidyverse)

# Total amount exchanged between each unordered pair of individuals.
# The pair is identified by two real columns (sorted with pmin/pmax) rather
# than by a "_"-joined string that is split again afterwards, so names that
# themselves contain "_" are handled correctly.
dat2 <- dat %>%
  mutate(
    Amount = as.numeric(str_remove(Amount, "\\$")),
    # as.character() guards against From/To having been read in as factors
    Individual_1 = pmin(as.character(From), as.character(To)),
    Individual_2 = pmax(as.character(From), as.character(To))
  ) %>%
  group_by(Individual_1, Individual_2) %>%
  summarize(Sum = sum(Amount, na.rm = TRUE), .groups = "drop") %>%
  mutate(Sum = str_c("$", Sum)) # put the currency sign back for display
print(dat2)
# # A tibble: 5 x 3
# Individual_1 Individual_2 Sum
# <chr> <chr> <chr>
# 1 A B $125
# 2 A C $230
# 3 A D $30
# 4 B C $70
# 5 C D $110
Data
dat <- read.table(text = "From To Amount
A B $100
A C $40
A D $30
B A $25
B C $70
C A $190
C D $110",
header = TRUE)
A complete solution without packages, based on @RonakShah's great pmin/pmax approach, using list notation in aggregate (in contrast to formula notation), which allows name assignment.
# Sum the amounts exchanged between each unordered pair of individuals and
# format the total with a leading "$".
with(
  transform(
    d,
    # Strip only the currency decoration. Removing every non-digit ("\\D")
    # would also delete a decimal point or minus sign ("$12.50" -> 1250).
    a = as.numeric(gsub("[^0-9.-]", "", Amount)),
    b = pmin(From, To), # first member of the sorted pair
    c = pmax(From, To)  # second member of the sorted pair
  ),
  aggregate(
    list(Sum = a),
    list(Individual_1 = b, Individual_2 = c),
    function(x) paste0("$", sum(x))
  )
)
# Individual_1 Individual_2 Sum
# 1 A B $125
# 2 A C $230
# 3 B C $70
# 4 A D $30
# 5 C D $110
Data:
d <- structure(list(From = c("A", "A", "A", "B", "B", "C", "C"), To = c("B",
"C", "D", "A", "C", "A", "D"), Amount = c("$100", "$40", "$30",
"$25", "$70", "$190", "$110")), class = "data.frame", row.names = c(NA,
-7L))
I have a .txt file
test.txt
V1 V2 Date
A B 2020-01-02
C D 2020-02-27
E F 2020-09-10
G H 2020-09-15
I want to subset data based on the most recent month.
I did the following, which does the job, but I want to determine the most recent month automatically rather than typing it in manually, and then extract the data:
# Label each row with its full month name; factor levels follow month.name.
# NOTE(review): format(x, "%B") expects a Date/POSIXt value - confirm that
# test$Date is not still a character column at this point.
test$month <- factor(format(test$Date, "%B"),levels = month.name)
# Keep the rows for the manually chosen month.
# NOTE(review): on a plain data.frame, a single index without a trailing
# comma selects columns rather than rows (`test[cond, ]` filters rows) -
# confirm what class `test` has.
test.subset <- test[test$month == "September"]
We can arrange by the Date class column and filter on the formatted value by comparing it with the last one
library(dplyr)

# Keep only the rows that fall in the most recent calendar month.
# The comparison key includes the year ("%Y-%m"); comparing month names
# alone would also keep the same month from earlier years.
test %>%
  mutate(Date = as.Date(Date), Month = format(Date, "%Y-%m")) %>%
  arrange(Date) %>%                  # latest date ends up last
  filter(Month == last(Month)) %>%
  select(-Month)                     # helper column is not part of the result
-output
# V1 V2 Date
#1 E F 2020-09-10
#2 G H 2020-09-15
data
test <- structure(list(V1 = c("A", "C", "E", "G"), V2 = c("B", "D", "F",
"H"), Date = c("2020-01-02", "2020-02-27", "2020-09-10", "2020-09-15"
)), class = "data.frame", row.names = c(NA, -4L))
Here is a base R option using subset + gsub
# Strip the trailing run of digits from each Date string to get a
# year-month key (`ym`), keep the rows whose key is the largest, and drop
# the helper column from the result.
subset(
  transform(df, ym = sub("\\d+$", "", Date)),
  subset = ym == max(ym),
  select = -ym
)
which gives
V1 V2 Date
3 E F 2020-09-10
4 G H 2020-09-15
A data.table option
library(data.table)

# Keep the rows belonging to the most recent year-month.
# Year and month are compared together as one "%Y-%m" key: taking max(Year)
# and max(Month) separately can combine into a year/month pair that does not
# exist in the data (e.g. 2019-12 and 2020-03 would give 2020-12).
# Filtering directly also avoids adding helper columns to `df` by reference.
setDT(df)[
  format(as.IDate(Date), "%Y-%m") == max(format(as.IDate(Date), "%Y-%m"))
]
gives
V1 V2 Date
1: E F 2020-09-10
2: G H 2020-09-15
Data
> dput(df)
structure(list(V1 = c("A", "C", "E", "G"), V2 = c("B", "D", "F",
"H"), Date = c("2020-01-02", "2020-02-27", "2020-09-10", "2020-09-15"
)), class = "data.frame", row.names = c(NA, -4L))
Using the structure shared by @ThomasIsCoding, and assuming the year is constant, one could just look for the rows with the max month and filter for that:
# data.table version: keep the rows whose month number equals the largest
# month number in the Date column (the year is not taken into account).
library(data.table)
setDT(df)
df[month(as.IDate(Date)) == max(month(as.IDate(Date))), ]
V1 V2 Date
1: E F 2020-09-10
2: G H 2020-09-15
I have a dataframe in which I have to create a new column based on the difference of two dates. Example:
Col1 Col2 Col3 Date New_Column_Required
A X A 01/01/2001 Wave1
B Y Q 01/01/2001 Wave1
C Z N 01/01/2001 Wave1
D W M 02/01/2001 Wave2
E Q V 02/01/2001 Wave2
F R O 03/01/2001 Wave3
G S T 03/01/2001 Wave3
The 2nd date minus the 1st date should give Wave 1, the 3rd date minus the 2nd date Wave 2, and so on. The problem I'm facing is that each date appears multiple times, and I can't figure out how to handle that.
Using dplyr we could change Date to class Date, arrange them based on Date and subtract Date from the first value.
library(dplyr)

# Parse the day/month/year strings, order the rows chronologically, and
# label each row by how many days it lies after the earliest date
# (the earliest date itself is Wave1).
df %>%
  mutate(Date = lubridate::dmy(Date)) %>%
  arrange(Date) %>%
  mutate(new_col = paste0("Wave", as.numeric(Date - first(Date)) + 1))
# Alternative: number the distinct dates consecutively instead
# mutate(new_col = paste0("Wave", as.integer(as.factor(Date))))
# Col1 Col2 Col3 Date new_col
#1 A X A 2001-01-01 Wave1
#2 B Y Q 2001-01-01 Wave1
#3 C Z N 2001-01-01 Wave1
#4 D W M 2001-01-02 Wave2
#5 E Q V 2001-01-02 Wave2
#6 F R O 2001-01-03 Wave3
#7 G S T 2001-01-03 Wave3
And the same logic in base R :
# Base R version: parse the dd/mm/yyyy strings, sort by date, then label each
# row by its day offset from the first (earliest) row.
df[["Date"]] <- as.Date(df[["Date"]], format = "%d/%m/%Y")
df <- df[order(df[["Date"]]), ]
within(df, new_col <- paste0('Wave', Date - Date[1] + 1))
data
df <- structure(list(Col1 = c("A", "B", "C", "D", "E", "F", "G"), Col2 = c("X",
"Y", "Z", "W", "Q", "R", "S"), Col3 = c("A", "Q", "N", "M", "V",
"O", "T"), Date = c("01/01/2001", "01/01/2001", "01/01/2001",
"02/01/2001", "02/01/2001", "03/01/2001", "03/01/2001")), row.names = c(NA,
-7L), class = "data.frame")