removing duplicates with aggregated groups in R - r

Here is a sample of my data:
kod <- structure(list(ID_WORKES = c(28029571L, 28029571L, 28029571L,
28029571L, 28029571L, 28029571L, 28029571L, 28029571L, 28029571L
), TABL_NOM = c(9716L, 9716L, 9716L, 9716L, 9716L, 9716L, 9716L,
9716L, 9716L), NAME = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = "Dim", class = "factor"), ID_SP_NAR = c(20L,
20L, 20L, 30L, 30L, 30L, 30L, 30L, 30L), KOD_DOR = c(28L, 28L,
28L, 28L, 28L, 28L, 28L, 28L, 28L), KOD_DEPO = c(9167L, 9167L,
9167L, 9167L, 9167L, 9167L, 9167L, 9167L, 9167L), COLUMN_MASH = c(13L,
13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L), prop_violations = c(0.00561797752808989,
0.00293255131964809, 0.00495049504950495, 0.00215982721382289,
0.0120481927710843, 0.00561797752808989, 0.00293255131964809,
0.00591715976331361, 0.00495049504950495), mash_score = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), row.names = c(NA, -9L), class = "data.frame")
W
hat I would like to achieve is the following:
for each group formed by the columns ID_WORKES, TABL_NOM, NAME, KOD_DOR and KOD_DEPO, I would like to have a unique value in ID_SP_NAR.
For example here we have six rows where ID_SP_NAR == 30 with different values for prop_violations.
In this case I would like to summarise these six rows such that the remaining value for prop_violations equals the mean for these six rows.
The desired output then would look like:
ID_WORKES TABL_NOM NAME KOD_DOR KOD_DEPO ID_SP_NAR prop_violations mash_score
1 28029571 9716 Dim 28 9167 20 0.004500341 0
2 28029571 9716 Dim 28 9167 30 0.005604367 0
But there is one more thing: if for some duplicate values in prop_violations for ID_SP_NAR, mash_ score has value >0 then left last value for which mash_score has value>0
for example.
ID_WORKES TABL_NOM NAME ID_SP_NAR KOD_DOR KOD_DEPO COLUMN_MASH prop_violations mash_score
1 28029571 9716 Dim 30 28 9167 13 0,0056 0
2 28029571 9716 Dim 30 28 9167 13 0,012048193 0
3 28029571 9716 Dim 30 28 9167 13 0,005617978 0
4 28029571 9716 Dim 30 28 9167 13 0,002932551 1
5 28029571 9716 Dim 30 28 9167 13 0,00591716 0
6 28029571 9716 Dim 30 28 9167 13 0,004950495 0
in such case will left only value 0,002932551 by prop_violation for ID_SP_NAR=30,cause mash_score>0
How reach this conditions?

An option using data.table:
setDT(kod)
kod[, {
if(any(mash_score)>0) {
i <- which(mash_score>0)[1L]
.(prop_violations=prop_violations[i], mash_score=mash_score[i])
} else
.(prop_violations=mean(prop_violations), mash_score=mash_score[1L])
},
.(ID_WORKES, TABL_NOM, NAME, KOD_DOR, KOD_DEPO, ID_SP_NAR)]
output:
ID_WORKES TABL_NOM NAME KOD_DOR KOD_DEPO ID_SP_NAR prop_violations mash_score
1: 28029571 9716 Dim 28 9167 20 0.004500341 0
2: 28029571 9716 Dim 28 9167 30 0.002932551 1
data:
kod <- structure(list(ID_WORKES = c(28029571L, 28029571L, 28029571L,
28029571L, 28029571L, 28029571L, 28029571L, 28029571L, 28029571L
), TABL_NOM = c(9716L, 9716L, 9716L, 9716L, 9716L, 9716L, 9716L,
9716L, 9716L), NAME = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = "Dim", class = "factor"), ID_SP_NAR = c(20L,
20L, 20L, 30L, 30L, 30L, 30L, 30L, 30L), KOD_DOR = c(28L, 28L,
28L, 28L, 28L, 28L, 28L, 28L, 28L), KOD_DEPO = c(9167L, 9167L,
9167L, 9167L, 9167L, 9167L, 9167L, 9167L, 9167L), COLUMN_MASH = c(13L,
13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L), prop_violations = c(0.00561797752808989,
0.00293255131964809, 0.00495049504950495, 0.00215982721382289,
0.0120481927710843, 0.00561797752808989, 0.00293255131964809,
0.00591715976331361, 0.00495049504950495), mash_score = c(0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L)), row.names = c(NA, -9L), class = "data.frame")

here is a solution using the tidyverse package:
kod %>%
group_by(ID_WORKES, TABL_NOM, NAME, KOD_DOR, KOD_DEPO, ID_SP_NAR) %>%
summarise(prop_violations = if (all(mash_score == 0)) mean(prop_violations) else last(prop_violations[mash_score > 0]))
If for a specific group all mash_score are equal to zero the average is returned (using mean). If at least one mash_score is larger than zero then the last value of prop_violations for which mash_score > 0 is returned (using dplyr::last).

Related

Renumber sequence in spell dataset

df <- structure(list(ID = c(2L, 2L, 13L, 13L, 13L, 21L, 21L, 21L, 24L,
24L, 24L, 24L), mignr = c(1L, 0L, 1L, 2L, 0L, 0L, 2L, 1L, 2L,
3L, 0L, 1L), start = c(1387L, 903L, 1357L, 1391L, 1087L, 936L,
1367L, 1354L, 1363L, 1392L, 908L, 1361L), end = c(1401L, 1386L,
1390L, 1401L, 1356L, 1353L, 1399L, 1366L, 1391L, 1400L, 1360L,
1362L), staytime = c(15L, 484L, 34L, 11L, 270L, 418L, 33L, 13L,
29L, 9L, 453L, 2L)), row.names = c(NA, -12L), class = "data.frame")
My objective is to reset the sequences in this spell dataset so that they all restart at zero and for mignr greater than one increase by staytime. The data is grouped by IDs and mignr represents the sequence in the spell. I have set start == 0 end == 1 for the first spell (mignr == 0) and would like for each subsequent spell that
start is the previous end + 1
and end is start + (staytime - 1)
df <- df %>%
mutate(start = ifelse(mignr == 0, 0, start)) %>%
mutate(end = ifelse(mignr == 0, 1, end))
max_spell <- max(df$mignr)
for(i in seq_along(1:max_spell)){
j <- i-1
df <- df %>%
mutate(start = ifelse(mignr == i, (df[which(df$ID == ID & df$mignr == j),4]), start)) %>%
mutate(end = ifelse(mignr == i, start + (staytime - 1), end))
}
df
My attempt seems to work for mignr = 1 the start and end values are both as I would expect. However for subsequent values of mignr (>1) the start value is no longer what I would expect it to be. The way I am currently calculating the start is not very clean, but I can't quite tell why for subsequent iterations it is breaking.
I would use cumsum() on staytime, but for this to work, I'd change the first staytime of mignr == 0 to 1, since that is basically what setting the first start and end to 0 and 1 implies. Would that be acceptable?
library(tidyverse)
df %>%
group_by(ID) %>%
arrange(mignr, ,.by_group = TRUE) %>%
mutate(start = (row_number()- 1L),
staytime = ifelse(row_number() == 1, 1, staytime),
end = cumsum(staytime),
start = lag(end + 1, default = first(start)))
#> # A tibble: 12 x 5
#> # Groups: ID [4]
#> ID mignr start end staytime
#> <int> <int> <dbl> <dbl> <dbl>
#> 1 2 0 0 1 1
#> 2 2 1 2 16 15
#> 3 13 0 0 1 1
#> 4 13 1 2 35 34
#> 5 13 2 36 46 11
#> 6 21 0 0 1 1
#> 7 21 1 2 14 13
#> 8 21 2 15 47 33
#> 9 24 0 0 1 1
#> 10 24 1 2 3 2
#> 11 24 2 4 32 29
#> 12 24 3 33 41 9
Data from OP
df <- structure(list(ID = c(2L, 2L, 13L, 13L, 13L, 21L, 21L, 21L, 24L,
24L, 24L, 24L), mignr = c(1L, 0L, 1L, 2L, 0L, 0L, 2L, 1L, 2L,
3L, 0L, 1L), start = c(1387L, 903L, 1357L, 1391L, 1087L, 936L,
1367L, 1354L, 1363L, 1392L, 908L, 1361L), end = c(1401L, 1386L,
1390L, 1401L, 1356L, 1353L, 1399L, 1366L, 1391L, 1400L, 1360L,
1362L), staytime = c(15L, 484L, 34L, 11L, 270L, 418L, 33L, 13L,
29L, 9L, 453L, 2L)), row.names = c(NA, -12L), class = "data.frame")
Created on 2023-02-20 by the reprex package (v2.0.1)

delete observations by days in R

My dataset has the next structure
df=structure(list(Data = structure(c(12L, 13L, 14L, 15L, 16L, 17L,
18L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L), .Label = c("01.01.2018",
"02.01.2018", "03.01.2018", "04.01.2018", "05.01.2018", "06.01.2018",
"07.01.2018", "12.02.2018", "13.02.2018", "14.02.2018", "15.02.2018",
"25.12.2017", "26.12.2017", "27.12.2017", "28.12.2017", "29.12.2017",
"30.12.2017", "31.12.2017"), class = "factor"), sku = 1:18, metric = c(100L,
210L, 320L, 430L, 540L, 650L, 760L, 870L, 980L, 1090L, 1200L,
1310L, 1420L, 1530L, 1640L, 1750L, 1860L, 1970L), action = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L)), .Names = c("Data", "sku", "metric", "action"), class = "data.frame", row.names = c(NA,
-18L))
I need to delete observations that have certain dates.
But in this dataset there is action variable. The action column has only two values 0 and 1.
Observations on these certain dates should be deleted only for the zero category of action.
these dates are presented in a separate datase.
datedata=structure(list(Data = structure(c(18L, 19L, 20L, 21L, 22L, 5L,
7L, 9L, 11L, 13L, 15L, 17L, 23L, 1L, 2L, 3L, 4L, 6L, 8L, 10L,
12L, 14L, 16L), .Label = c("01.05.2018", "02.05.2018", "03.05.2018",
"04.05.2018", "05.03.2018", "05.05.2018", "06.03.2018", "06.05.2018",
"07.03.2018", "07.05.2018", "08.03.2018", "08.05.2018", "09.03.2018",
"09.05.2018", "10.03.2018", "10.05.2018", "11.03.2018", "21.02.2018",
"22.02.2018", "23.02.2018", "24.02.2018", "25.02.2018", "30.04.2018"
), class = "factor")), .Names = "Data", class = "data.frame", row.names = c(NA,
-23L))
how can i do it?
A solution is to use dplyr::filter as:
library(dplyr)
library(lubridate)
df %>% mutate(Data = dmy(Data)) %>%
filter(action==1 | (action==0 & !(Data %in% dmy(datedata$Data))))
# Data sku metric action
# 1 2017-12-25 1 100 0
# 2 2017-12-26 2 210 0
# 3 2017-12-27 3 320 0
# 4 2017-12-28 4 430 0
# 5 2017-12-29 5 540 0
# 6 2017-12-30 6 650 0
# 7 2017-12-31 7 760 0
# 8 2018-01-01 8 870 0
# 9 2018-01-02 9 980 1
# 10 2018-01-03 10 1090 1
# 11 2018-01-04 11 1200 1
# 12 2018-01-05 12 1310 1
# 13 2018-01-06 13 1420 1
# 14 2018-01-07 14 1530 1
# 15 2018-02-12 15 1640 1
# 16 2018-02-13 16 1750 1
# 17 2018-02-14 17 1860 1
# 18 2018-02-15 18 1970 1
I guess this will work. Fist use match to see weather there is a match in the day of df and the day in datedata, then filter it
library (dplyr)
df <- df %>% mutate (Data.flag = match(Data,datedata$Data)) %>%
filter(!is.na(Data.flag) & action == 0)

Weekday Factor off by 8 hours

I'm trying to add a factor to label the day of the week in my data, but the new "Weekday" column seems to be off by 8 hours. The "DateTime" field has the form '%Y-%m-%d %H:%M:%S', tz='GMT'. I've supplied sample data below, along with my code. Does see what I'm doing wrong and how to fix it?
Code:
##Weekday variables
WeekdayVar<-icartdffill
WeekdayVar$Weekday<-as.factor(weekdays(as.Date(WeekdayVar$DateTime, "%Y-%m-%d", tz="GMT")))
Sample Data:
dput(droplevels(icartdffill[1:50,]))
structure(list(OrderCnt = c(1L, 1L, 0L, 0L, 0L, 2L, 5L, 12L,
16L, 30L, 27L, 21L, 23L, 27L, 37L, 36L, 35L, 30L, 27L, 17L, 8L,
2L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 2L, 6L, 9L, 19L, 28L, 27L, 28L,
22L, 29L, 41L, 35L, 43L, 42L, 27L, 16L, 10L, 2L, 1L, 0L, 0L,
0L), DateTime = structure(c(1398931200, 1398934800, 1398938400,
1398942000, 1398945600, 1398949200, 1398952800, 1398956400, 1398960000,
1398963600, 1398967200, 1398970800, 1398974400, 1398978000, 1398981600,
1398985200, 1398988800, 1398992400, 1398996000, 1398999600, 1399003200,
1399006800, 1399010400, 1399014000, 1399017600, 1399021200, 1399024800,
1399028400, 1399032000, 1399035600, 1399039200, 1399042800, 1399046400,
1399050000, 1399053600, 1399057200, 1399060800, 1399064400, 1399068000,
1399071600, 1399075200, 1399078800, 1399082400, 1399086000, 1399089600,
1399093200, 1399096800, 1399100400, 1399104000, 1399107600), class = c("POSIXct",
"POSIXt"))), .Names = c("OrderCnt", "DateTime"), row.names = c(NA,
50L), class = "data.frame")

How to see the distribution of a column in R?

I have a dataset that looks like this:
A tibble: 935 x 17
wage hours iq kww educ exper tenure age married black south urban sibs brthord meduc
<int> <int> <int> <int> <int> <int> <int> <int> <fctr> <fctr> <fctr> <fctr> <int> <int> <int>
1 769 40 93 35 12 11 2 31 1 0 0 1 1 2 8
2 808 50 119 41 18 11 16 37 1 0 0 1 1 NA 14
3 825 40 108 46 14 11 9 33 1 0 0 1 1 2 14
4 650 40 96 32 12 13 7 32 1 0 0 1 4 3 12
5 562 40 74 27 11 14 5 34 1 0 0 1 10 6 6
6 1400 40 116 43 16 14 2 35 1 1 0 1 1 2 8
7 600 40 91 24 10 13 0 30 0 0 0 1 1 2 8
8 1081 40 114 50 18 8 14 38 1 0 0 1 2 3 8
9 1154 45 111 37 15 13 1 36 1 0 0 0 2 3 14
10 1000 40 95 44 12 16 16 36 1 0 0 1 1 1 12
...
What can I run to see the distribution of wage (the first column). Specifically, I want to see how many people have a wage of under $300.
What ggplot function can I run?
You can get the cumulative histogram:
library(ggplot2)
ggplot(df,aes(wage))+geom_histogram(aes(y=cumsum(..count..)))+
stat_bin(aes(y=cumsum(..count..)),geom="line",color="green")
If you specifically want to know the count of entries with a certain condition, in base r you can use the following:
count(df[df$wage > 1000,])
## # A tibble: 1 x 1
## n
## <int>
## 1 3
Data:
df <- structure(list(wage = c(769L, 808L, 825L, 650L, 562L, 1400L,
600L, 1081L, 1154L, 1000L), hours = c(40L, 50L, 40L, 40L, 40L,
40L, 40L, 40L, 45L, 40L), iq = c(93L, 119L, 108L, 96L, 74L, 116L,
91L, 114L, 111L, 95L), kww = c(35L, 41L, 46L, 32L, 27L, 43L,
24L, 50L, 37L, 44L), educ = c(12L, 18L, 14L, 12L, 11L, 16L, 10L,
18L, 15L, 12L), exper = c(11L, 11L, 11L, 13L, 14L, 14L, 13L,
8L, 13L, 16L), tenure = c(2L, 16L, 9L, 7L, 5L, 2L, 0L, 14L, 1L,
16L), age = c(31L, 37L, 33L, 32L, 34L, 35L, 30L, 38L, 36L, 36L
), married = c(1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L), black = c(0L,
0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L), south = c(0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L), urban = c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 0L, 1L), sibs = c(1L, 1L, 1L, 4L, 10L, 1L, 1L, 2L, 2L, 1L
), brthord = c(2L, NA, 2L, 3L, 6L, 2L, 2L, 3L, 3L, 1L), meduc = c(8L,
14L, 14L, 12L, 6L, 8L, 8L, 8L, 14L, 12L)), .Names = c("wage",
"hours", "iq", "kww", "educ", "exper", "tenure", "age", "married",
"black", "south", "urban", "sibs", "brthord", "meduc"), row.names = c(NA,
-10L), class = c("tbl_df", "tbl", "data.frame"))
Try this:
library(dplyr)
library(ggplot2)
df <- df %>% filter(wage < 300)
qplot(wage, data = df)

plotting x-axes with custom label in R

I've to plot these data:
day temperature
02/01/2012 13:30:00 10
10/01/2012 20:30:00 8
15/01/2012 13:30:00 12
25/01/2012 20:30:00 6
02/02/2012 13:30:00 5
10/02/2012 20:30:00 3
15/02/2012 13:30:00 6
25/02/2012 20:30:00 -1
02/03/2012 13:30:00 4
10/03/2012 20:30:00 -2
15/03/2012 13:30:00 7
25/03/2012 20:30:00 1
in the x-axis I want to label only the month and the day (e.g. Jan 02 ). How can I do this using the command plot() and axis()?
First, you will need to put your date text into a dtae class (e.g. as.POSIXct):
df <- structure(list(day = structure(list(sec = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0), min = c(30L, 30L, 30L, 30L, 30L, 30L, 30L,
30L, 30L, 30L, 30L, 30L), hour = c(13L, 20L, 13L, 20L, 13L, 20L,
13L, 20L, 13L, 20L, 13L, 20L), mday = c(2L, 10L, 15L, 25L, 2L,
10L, 15L, 25L, 2L, 10L, 15L, 25L), mon = c(0L, 0L, 0L, 0L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L), year = c(112L, 112L, 112L, 112L,
112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L), wday = c(1L,
2L, 0L, 3L, 4L, 5L, 3L, 6L, 5L, 6L, 4L, 0L), yday = c(1L, 9L,
14L, 24L, 32L, 40L, 45L, 55L, 61L, 69L, 74L, 84L), isdst = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L)), .Names = c("sec",
"min", "hour", "mday", "mon", "year", "wday", "yday", "isdst"
), class = c("POSIXlt", "POSIXt")), temperature = c(10L, 8L,
12L, 6L, 5L, 3L, 6L, -1L, 4L, -2L, 7L, 1L)), .Names = c("day",
"temperature"), row.names = c(NA, -12L), class = "data.frame")
df
df$day <- as.POSIXct(df$day, format="%d/%m/%Y %H:%M:%S")
Your dates should now plot correctly. Don't apply the x-axis, by using the argument xaxt="n". Afterwards, you can create a sequence of dates where you would like your axis labeled, and apply this with axis.POSIXct:
plot(df$day, df$temperature, t="l", ylab="Temperature", xlab="Date", xaxt="n")
SEQ <- seq(min(df$day), max(df$day), by="months")
axis.POSIXct(SEQ, at=SEQ, side=1, format="%b %Y")
Similarly, to get a daily axis, simply modify the SEQ and axis.POSIXct code accordingly. For example, you may try:
plot(df$day, df$temperature, t="l", ylab="Temperature", xlab="Date", xaxt="n")
SEQ <- seq(min(df$day), max(df$day), by="days")
axis.POSIXct(SEQ, at=SEQ, side=1, format="%b %d")

Resources