# Spell data: one row per (ID, mignr) migration spell, with the original
# start/end clock values and the spell duration in days (staytime).
df <- data.frame(
  ID       = c(2L, 2L, 13L, 13L, 13L, 21L, 21L, 21L, 24L, 24L, 24L, 24L),
  mignr    = c(1L, 0L, 1L, 2L, 0L, 0L, 2L, 1L, 2L, 3L, 0L, 1L),
  start    = c(1387L, 903L, 1357L, 1391L, 1087L, 936L,
               1367L, 1354L, 1363L, 1392L, 908L, 1361L),
  end      = c(1401L, 1386L, 1390L, 1401L, 1356L, 1353L,
               1399L, 1366L, 1391L, 1400L, 1360L, 1362L),
  staytime = c(15L, 484L, 34L, 11L, 270L, 418L, 33L, 13L, 29L, 9L, 453L, 2L)
)
My objective is to reset the sequences in this spell dataset so that they all restart at zero and, for mignr greater than one, increase by staytime. The data is grouped by ID, and mignr represents the sequence within the spell. I have set start = 0 and end = 1 for the first spell (mignr == 0), and would like, for each subsequent spell, that
start is the previous end + 1
and end is start + (staytime - 1)
# OP's attempt: pin the first spell (mignr == 0) to start = 0, end = 1,
# then loop over the remaining spell numbers, trying to set each spell's
# start from the previous spell's end.
df <- df %>%
mutate(start = ifelse(mignr == 0, 0, start)) %>%
mutate(end = ifelse(mignr == 0, 1, end))
max_spell <- max(df$mignr)
# NOTE: seq_along(1:max_spell) is just 1:max_spell; seq_len(max_spell)
# would be the idiomatic (and zero-safe) form.
for(i in seq_along(1:max_spell)){
j <- i-1
# BUG (why mignr > 1 breaks): inside mutate(), `ID` is the full column
# vector, so `df$ID == ID` compares the two whole columns element-wise
# instead of matching "the current row's ID".  The subset
# df[which(...), 4] therefore returns the `end` values of EVERY row
# whose ID/mignr happen to line up positionally, and ifelse() recycles
# that vector across rows — correct by accident for i == 1, wrong after.
df <- df %>%
mutate(start = ifelse(mignr == i, (df[which(df$ID == ID & df$mignr == j),4]), start)) %>%
mutate(end = ifelse(mignr == i, start + (staytime - 1), end))
}
df
My attempt seems to work for mignr = 1 the start and end values are both as I would expect. However for subsequent values of mignr (>1) the start value is no longer what I would expect it to be. The way I am currently calculating the start is not very clean, but I can't quite tell why for subsequent iterations it is breaking.
I would use cumsum() on staytime, but for this to work, I'd change the first staytime of mignr == 0 to 1, since that is basically what setting the first start and end to 0 and 1 implies. Would that be acceptable?
library(tidyverse)
# Per ID, rebuild the spell clock: the first spell (mignr == 0) spans
# [0, 1] (its staytime is treated as 1), and each later spell starts
# right after the previous spell's end and runs for `staytime` days.
df %>%
  group_by(ID) %>%
  # Order spells within each ID.  The original call had a stray empty
  # argument — `arrange(mignr, ,.by_group = TRUE)` — which is removed.
  arrange(mignr, .by_group = TRUE) %>%
  mutate(
    start = row_number() - 1L,                         # placeholder; supplies first(start) == 0 below
    staytime = ifelse(row_number() == 1, 1, staytime), # first spell counts as a single day
    end = cumsum(staytime),                            # running end-of-spell clock
    start = lag(end + 1, default = first(start))       # each spell starts after the previous end
  )
#> # A tibble: 12 x 5
#> # Groups: ID [4]
#> ID mignr start end staytime
#> <int> <int> <dbl> <dbl> <dbl>
#> 1 2 0 0 1 1
#> 2 2 1 2 16 15
#> 3 13 0 0 1 1
#> 4 13 1 2 35 34
#> 5 13 2 36 46 11
#> 6 21 0 0 1 1
#> 7 21 1 2 14 13
#> 8 21 2 15 47 33
#> 9 24 0 0 1 1
#> 10 24 1 2 3 2
#> 11 24 2 4 32 29
#> 12 24 3 33 41 9
Data from OP
# Data from the OP, repeated verbatim for the reprex — identical to the
# df defined at the top of this document.
df <- structure(list(ID = c(2L, 2L, 13L, 13L, 13L, 21L, 21L, 21L, 24L,
24L, 24L, 24L), mignr = c(1L, 0L, 1L, 2L, 0L, 0L, 2L, 1L, 2L,
3L, 0L, 1L), start = c(1387L, 903L, 1357L, 1391L, 1087L, 936L,
1367L, 1354L, 1363L, 1392L, 908L, 1361L), end = c(1401L, 1386L,
1390L, 1401L, 1356L, 1353L, 1399L, 1366L, 1391L, 1400L, 1360L,
1362L), staytime = c(15L, 484L, 34L, 11L, 270L, 418L, 33L, 13L,
29L, 9L, 453L, 2L)), row.names = c(NA, -12L), class = "data.frame")
Created on 2023-02-20 by the reprex package (v2.0.1)
Related
Here is a sample of my data:
# Violation records for a single worker: nine rows, two distinct
# ID_SP_NAR codes (3 rows of 20, 6 rows of 30); mash_score is all zero
# in this sample.
kod <- data.frame(
  ID_WORKES = rep(28029571L, 9),
  TABL_NOM = rep(9716L, 9),
  NAME = factor(rep("Dim", 9)),
  ID_SP_NAR = rep(c(20L, 30L), times = c(3L, 6L)),
  KOD_DOR = rep(28L, 9),
  KOD_DEPO = rep(9167L, 9),
  COLUMN_MASH = rep(13L, 9),
  prop_violations = c(0.00561797752808989, 0.00293255131964809,
                      0.00495049504950495, 0.00215982721382289,
                      0.0120481927710843, 0.00561797752808989,
                      0.00293255131964809, 0.00591715976331361,
                      0.00495049504950495),
  mash_score = rep(0L, 9)
)
What I would like to achieve is the following:
for each group formed by the columns ID_WORKES, TABL_NOM, NAME, KOD_DOR and KOD_DEPO, I would like to have a unique value in ID_SP_NAR.
For example here we have six rows where ID_SP_NAR == 30 with different values for prop_violations.
In this case I would like to summarise these six rows such that the remaining value for prop_violations equals the mean for these six rows.
The desired output then would look like:
ID_WORKES TABL_NOM NAME KOD_DOR KOD_DEPO ID_SP_NAR prop_violations mash_score
1 28029571 9716 Dim 28 9167 20 0.004500341 0
2 28029571 9716 Dim 28 9167 30 0.005604367 0
But there is one more thing: if, among the duplicated rows for an ID_SP_NAR value, mash_score has a value > 0, then keep the last value of prop_violations for which mash_score > 0.
for example.
ID_WORKES TABL_NOM NAME ID_SP_NAR KOD_DOR KOD_DEPO COLUMN_MASH prop_violations mash_score
1 28029571 9716 Dim 30 28 9167 13 0,0056 0
2 28029571 9716 Dim 30 28 9167 13 0,012048193 0
3 28029571 9716 Dim 30 28 9167 13 0,005617978 0
4 28029571 9716 Dim 30 28 9167 13 0,002932551 1
5 28029571 9716 Dim 30 28 9167 13 0,00591716 0
6 28029571 9716 Dim 30 28 9167 13 0,004950495 0
In that case, only the value 0,002932551 should be kept as prop_violations for ID_SP_NAR = 30, because its mash_score > 0.
How can I implement these conditions?
An option using data.table:
setDT(kod)
# Per group: if any row is flagged (mash_score > 0), keep the LAST
# flagged row's values, as the question asks; otherwise report the group
# mean of prop_violations with the (all-zero) mash_score.
kod[, {
  # Fixes two issues in the original j-expression:
  #  * `any(mash_score)>0` had a misplaced parenthesis — any() coerced
  #    the numeric column to logical (with a warning); the test belongs
  #    inside: any(mash_score > 0).
  #  * `which(...)[1L]` picked the FIRST flagged row, but the stated
  #    requirement is to keep the last one.
  if(any(mash_score > 0)) {
    i <- max(which(mash_score > 0))
    .(prop_violations=prop_violations[i], mash_score=mash_score[i])
  } else
    .(prop_violations=mean(prop_violations), mash_score=mash_score[1L])
},
.(ID_WORKES, TABL_NOM, NAME, KOD_DOR, KOD_DEPO, ID_SP_NAR)]
output:
ID_WORKES TABL_NOM NAME KOD_DOR KOD_DEPO ID_SP_NAR prop_violations mash_score
1: 28029571 9716 Dim 28 9167 20 0.004500341 0
2: 28029571 9716 Dim 28 9167 30 0.002932551 1
data:
# Same kod sample as shown earlier EXCEPT mash_score: here row 7 is
# flagged (mash_score == 1L), which exercises the "keep the flagged
# row" branch of the grouping code above.
kod <- structure(list(ID_WORKES = c(28029571L, 28029571L, 28029571L,
28029571L, 28029571L, 28029571L, 28029571L, 28029571L, 28029571L
), TABL_NOM = c(9716L, 9716L, 9716L, 9716L, 9716L, 9716L, 9716L,
9716L, 9716L), NAME = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = "Dim", class = "factor"), ID_SP_NAR = c(20L,
20L, 20L, 30L, 30L, 30L, 30L, 30L, 30L), KOD_DOR = c(28L, 28L,
28L, 28L, 28L, 28L, 28L, 28L, 28L), KOD_DEPO = c(9167L, 9167L,
9167L, 9167L, 9167L, 9167L, 9167L, 9167L, 9167L), COLUMN_MASH = c(13L,
13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L), prop_violations = c(0.00561797752808989,
0.00293255131964809, 0.00495049504950495, 0.00215982721382289,
0.0120481927710843, 0.00561797752808989, 0.00293255131964809,
0.00591715976331361, 0.00495049504950495), mash_score = c(0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L)), row.names = c(NA, -9L), class = "data.frame")
here is a solution using the tidyverse package:
# Per group: average prop_violations when no row is flagged, otherwise
# keep the last flagged row's value.  The flagged row's mash_score is
# also carried through, so the output matches the desired table — the
# original summarise() dropped the mash_score column entirely.
kod %>%
  group_by(ID_WORKES, TABL_NOM, NAME, KOD_DOR, KOD_DEPO, ID_SP_NAR) %>%
  summarise(
    prop_violations = if (all(mash_score == 0)) mean(prop_violations)
                      else last(prop_violations[mash_score > 0]),
    # Computed after prop_violations; the RHS still reads the original
    # mash_score column within its own expression.
    mash_score = if (all(mash_score == 0)) first(mash_score)
                 else last(mash_score[mash_score > 0])
  )
If for a specific group all mash_score are equal to zero the average is returned (using mean). If at least one mash_score is larger than zero then the last value of prop_violations for which mash_score > 0 is returned (using dplyr::last).
I am interested in figuring out the frequency of unemployed people, who are also African-American/Black in my data set. I have a large data set, that includes the variables OCC (unemployed individuals are coded as 0), and Race (AA/Black is coded as 2).
I tried to use the group_by() function from the tidyverse, but I think I may be doing this incorrectly, because I get the following error messages.
This is the code:
# OP's attempt: group the CPS data by occupation and race, then count.
RACE <- group_by(cps_data, OCC, RACE)
occupation <- summarise(RACE,
count = n(),
# This assignment is what triggers the reported error: OCC is one of
# the grouping variables, and summarise() refuses to overwrite a
# grouping column ("Column `OCC` can't be modified ...").
OCC = mean(OCC, na.rm = TRUE)
)
# Since the data is grouped by OCC, mean(OCC) within each group is just
# OCC itself — hence the unhelpful tibble where occupation == OCC.
summarise(RACE, occupation = mean(OCC, na.rm = TRUE))
The occupation object that I created gives me the error message:
Error in summarise_impl(.data, dots) :
Column `OCC` can't be modified because it's a grouping variable
The summarise function gives me a tibble that doesn't seem very helpful:
# A tibble: 1,374 x 3
# Groups: OCC [?]
OCC RACE occupation
<int> <int> <dbl>
1 0 1 0
2 0 2 0
3 0 3 0
4 0 4 0
5 0 5 0
6 0 6 0
7 0 7 0
8 0 8 0
9 0 9 0
10 10 1 10
Here is some of my data - I tried to reproduce for you guys to help you assist me. You will see that above I made another data frame to only include OCC and RACE, as those are the only relevant factors at the moment.
dput(head(cps_data,4))
# dput() output: a 4-row sample of cps_data.  Per the question, OCC == 0
# codes unemployed individuals and RACE == 2 codes AA/Black.
structure(list(YEAR = c(2015L, 2015L, 2015L, 2015L), DATANUM = c(1L,
1L, 1L, 1L), SERIAL = c(1029644L, 1029644L, 1029705L, 1029708L
), CBSERIAL = c(403, 403, 1944, 1964), HHWT = c(194L, 194L, 142L,
77L), STATEICP = c(14L, 14L, 14L, 14L), STATEFIP = c(42L, 42L,
42L, 42L), CITY = c(5330L, 5330L, 5330L, 5330L), GQ = c(1L, 1L,
1L, 1L), PERNUM = c(1L, 3L, 1L, 1L), PERWT = c(194L, 140L, 142L,
78L), SEX = c(2L, 1L, 2L, 1L), AGE = c(37L, 35L, 60L, 41L), RACE = c(1L,
1L, 2L, 2L), RACED = c(100L, 100L, 200L, 200L), OCC = c(800L,
6260L, 0L, 350L), IND = c(7270L, 770L, 0L, 8190L), INCWAGE = c(75000L,
25000L, 0L, 83000L)), row.names = c(NA, 4L), class = "data.frame")
I am looking to get an output that shows me the number of individuals unemployed, who also identify as African-American/Black, so I can compare across my data set.
If I understood you correctly, you were almost there.
# Count the rows in each (OCC, RACE) combination; the OCC == 0 /
# RACE == 2 row gives the number of unemployed AA/Black individuals.
summarize(group_by(df, OCC, RACE), count = n())
# A tibble: 4 x 3
# Groups: OCC [4]
OCC RACE count
<int> <int> <int>
1 0 2 1
2 350 2 1
3 800 1 1
4 6260 1 1
Data
library(tidyverse)
# Same 4-row cps sample as shown in the question (dput output), assigned
# to df for the answer's pipeline.
df <- structure(list(YEAR = c(2015L, 2015L, 2015L, 2015L), DATANUM = c(1L,
1L, 1L, 1L), SERIAL = c(1029644L, 1029644L, 1029705L, 1029708L
), CBSERIAL = c(403, 403, 1944, 1964), HHWT = c(194L, 194L, 142L,
77L), STATEICP = c(14L, 14L, 14L, 14L), STATEFIP = c(42L, 42L,
42L, 42L), CITY = c(5330L, 5330L, 5330L, 5330L), GQ = c(1L, 1L,
1L, 1L), PERNUM = c(1L, 3L, 1L, 1L), PERWT = c(194L, 140L, 142L,
78L), SEX = c(2L, 1L, 2L, 1L), AGE = c(37L, 35L, 60L, 41L), RACE = c(1L,
1L, 2L, 2L), RACED = c(100L, 100L, 200L, 200L), OCC = c(800L,
6260L, 0L, 350L), IND = c(7270L, 770L, 0L, 8190L), INCWAGE = c(75000L,
25000L, 0L, 83000L)), row.names = c(NA, 4L), class = "data.frame")
My dataset has the next structure
# Daily metric per sku, in chronological order from 25 Dec 2017 to
# 15 Feb 2018; `action` flags the rows (first 8 rows 0, rest 1).
df <- data.frame(
  Data = factor(c("25.12.2017", "26.12.2017", "27.12.2017", "28.12.2017",
                  "29.12.2017", "30.12.2017", "31.12.2017", "01.01.2018",
                  "02.01.2018", "03.01.2018", "04.01.2018", "05.01.2018",
                  "06.01.2018", "07.01.2018", "12.02.2018", "13.02.2018",
                  "14.02.2018", "15.02.2018")),
  sku = 1:18,
  metric = as.integer(seq(100, 1970, by = 110)),
  action = rep(c(0L, 1L), times = c(8L, 10L))
)
I need to delete observations that have certain dates.
But in this dataset there is action variable. The action column has only two values 0 and 1.
Observations on these certain dates should be deleted only for the zero category of action.
These dates are presented in a separate dataset.
# Single-column lookup table of the dates (dd.mm.yyyy factor levels)
# whose action == 0 observations should be removed from df.
datedata=structure(list(Data = structure(c(18L, 19L, 20L, 21L, 22L, 5L,
7L, 9L, 11L, 13L, 15L, 17L, 23L, 1L, 2L, 3L, 4L, 6L, 8L, 10L,
12L, 14L, 16L), .Label = c("01.05.2018", "02.05.2018", "03.05.2018",
"04.05.2018", "05.03.2018", "05.05.2018", "06.03.2018", "06.05.2018",
"07.03.2018", "07.05.2018", "08.03.2018", "08.05.2018", "09.03.2018",
"09.05.2018", "10.03.2018", "10.05.2018", "11.03.2018", "21.02.2018",
"22.02.2018", "23.02.2018", "24.02.2018", "25.02.2018", "30.04.2018"
), class = "factor")), .Names = "Data", class = "data.frame", row.names = c(NA,
-23L))
how can i do it?
A solution is to use dplyr::filter as:
library(dplyr)
library(lubridate)

# Parse the exclusion dates once, then keep every action == 1 row and
# only those action == 0 rows whose date is not on the exclusion list.
excluded_dates <- dmy(datedata$Data)
df %>%
  mutate(Data = dmy(Data)) %>%
  filter(action == 1 | (action == 0 & !(Data %in% excluded_dates)))
# Data sku metric action
# 1 2017-12-25 1 100 0
# 2 2017-12-26 2 210 0
# 3 2017-12-27 3 320 0
# 4 2017-12-28 4 430 0
# 5 2017-12-29 5 540 0
# 6 2017-12-30 6 650 0
# 7 2017-12-31 7 760 0
# 8 2018-01-01 8 870 0
# 9 2018-01-02 9 980 1
# 10 2018-01-03 10 1090 1
# 11 2018-01-04 11 1200 1
# 12 2018-01-05 12 1310 1
# 13 2018-01-06 13 1420 1
# 14 2018-01-07 14 1530 1
# 15 2018-02-12 15 1640 1
# 16 2018-02-13 16 1750 1
# 17 2018-02-14 17 1860 1
# 18 2018-02-15 18 1970 1
I guess this will work. First use match to see whether each date in df matches a date in datedata, then filter on it.
library(dplyr)
# Flag rows whose date occurs in datedata, then KEEP a row when it is
# either unflagged or has action != 0.  (The original filter,
# `!is.na(Data.flag) & action == 0`, kept exactly the rows that were
# supposed to be deleted — the condition was inverted.)
df <- df %>%
  mutate(Data.flag = match(Data, datedata$Data)) %>%
  filter(is.na(Data.flag) | action != 0)
I have a dataset that looks like this:
A tibble: 935 x 17
wage hours iq kww educ exper tenure age married black south urban sibs brthord meduc
<int> <int> <int> <int> <int> <int> <int> <int> <fctr> <fctr> <fctr> <fctr> <int> <int> <int>
1 769 40 93 35 12 11 2 31 1 0 0 1 1 2 8
2 808 50 119 41 18 11 16 37 1 0 0 1 1 NA 14
3 825 40 108 46 14 11 9 33 1 0 0 1 1 2 14
4 650 40 96 32 12 13 7 32 1 0 0 1 4 3 12
5 562 40 74 27 11 14 5 34 1 0 0 1 10 6 6
6 1400 40 116 43 16 14 2 35 1 1 0 1 1 2 8
7 600 40 91 24 10 13 0 30 0 0 0 1 1 2 8
8 1081 40 114 50 18 8 14 38 1 0 0 1 2 3 8
9 1154 45 111 37 15 13 1 36 1 0 0 0 2 3 14
10 1000 40 95 44 12 16 16 36 1 0 0 1 1 1 12
...
What can I run to see the distribution of wage (the first column). Specifically, I want to see how many people have a wage of under $300.
What ggplot function can I run?
You can get the cumulative histogram:
library(ggplot2)
# Cumulative histogram of wage: each bar's height is the running count
# of rows up to that bin, and the green stat_bin line traces the same
# cumulative count.  NOTE(review): `..count..` is the legacy dot-dot
# notation — newer ggplot2 releases prefer after_stat(count); confirm
# the installed version before modernizing.
ggplot(df,aes(wage))+geom_histogram(aes(y=cumsum(..count..)))+
stat_bin(aes(y=cumsum(..count..)),geom="line",color="green")
If you specifically want to know the count of entries meeting a certain condition, you can use dplyr's count() on a base-R subset:
# Count the rows of the subset with wage > 1000 (dplyr::count on a
# base-R subset); returns a one-row tibble with column n.
count(df[df$wage > 1000,])
## # A tibble: 1 x 1
## n
## <int>
## 1 3
Data:
# 10-row sample of the wage tibble shown in the question (dput output).
df <- structure(list(wage = c(769L, 808L, 825L, 650L, 562L, 1400L,
600L, 1081L, 1154L, 1000L), hours = c(40L, 50L, 40L, 40L, 40L,
40L, 40L, 40L, 45L, 40L), iq = c(93L, 119L, 108L, 96L, 74L, 116L,
91L, 114L, 111L, 95L), kww = c(35L, 41L, 46L, 32L, 27L, 43L,
24L, 50L, 37L, 44L), educ = c(12L, 18L, 14L, 12L, 11L, 16L, 10L,
18L, 15L, 12L), exper = c(11L, 11L, 11L, 13L, 14L, 14L, 13L,
8L, 13L, 16L), tenure = c(2L, 16L, 9L, 7L, 5L, 2L, 0L, 14L, 1L,
16L), age = c(31L, 37L, 33L, 32L, 34L, 35L, 30L, 38L, 36L, 36L
), married = c(1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L), black = c(0L,
0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L), south = c(0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L), urban = c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 0L, 1L), sibs = c(1L, 1L, 1L, 4L, 10L, 1L, 1L, 2L, 2L, 1L
), brthord = c(2L, NA, 2L, 3L, 6L, 2L, 2L, 3L, 3L, 1L), meduc = c(8L,
14L, 14L, 12L, 6L, 8L, 8L, 8L, 14L, 12L)), .Names = c("wage",
"hours", "iq", "kww", "educ", "exper", "tenure", "age", "married",
"black", "south", "urban", "sibs", "brthord", "meduc"), row.names = c(NA,
-10L), class = c("tbl_df", "tbl", "data.frame"))
Try this:
library(dplyr)
library(ggplot2)
# Histogram of wages below $300.  Filter into a separate object instead
# of overwriting `df` (the original clobbered the full dataset as a
# side effect), so the complete data remains available afterwards.
low_wage <- df %>% filter(wage < 300)
qplot(wage, data = low_wage)
Here is a sample of the data I'm currently working on:
# Ten POSIXlt timestamps built directly from their broken-down
# components.  Per POSIXlt conventions (visible in the printed output
# below): `mon` is 0-based (4 = May) and `year` counts from 1900
# (111 = 2011).
x <- structure(list(sec = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
min = c(59L, 32L, 23L, 26L, 20L, 14L, 1L, 5L, 32L, 2L),
hour = c(10L, 15L, 12L, 12L, 16L, 18L, 18L, 9L, 14L, 12L),
mday = c(9L, 15L, 2L, 15L, 20L, 26L, 11L, 22L, 9L, 16L),
mon = c(4L, 11L, 10L, 7L, 9L, 8L, 10L, 8L, 8L, 4L),
year = c(111L, 111L, 111L, 111L, 111L, 111L, 111L, 111L, 111L, 111L),
wday = c(1L, 4L, 3L, 1L, 4L, 1L, 5L, 4L, 5L, 1L),
yday = c(128L, 348L, 305L, 226L, 292L, 268L, 314L, 264L, 251L, 135L),
isdst = c(0L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L)),
.Names = c("sec", "min", "hour", "mday", "mon", "year",
"wday", "yday", "isdst"),
class = c("POSIXlt", "POSIXt"))
So that
> x
[1] "2011-05-09 10:59:00" "2011-12-15 15:32:00" "2011-11-02 12:23:00"
[4] "2011-08-15 12:26:00" "2011-10-20 16:20:00" "2011-09-26 18:14:00"
[7] "2011-11-11 18:01:00" "2011-09-22 09:05:00" "2011-09-09 14:32:00"
[10] "2011-05-16 12:02:00"
Say I want to tabulate the distribution of x by month. This is how I accomplish it:
> table(strftime(x, '%m'))
05 08 09 10 11 12
2 1 3 1 2 1
Now I want to do a similar tabulation, but this time I want to group the data by bimester (and possibly by trimester or semester, later on). I've taken a look at the help page for strptime, but couldn't find an appropriate separator.
This is the best I have come up with so far:
> table(cut(x = as.numeric(strftime(x, '%m')),
breaks = c(1, 3, 5, 7, 9, 11, 13),
labels = c('1-2', '3-4', '5-6', '7-8', '9-10', '11-12'),
right = FALSE))
1-2 3-4 5-6 7-8 9-10 11-12
0 0 2 1 4 3
It is a convoluted way of reaching this, but it's OK for a simple example and a single case. However, this approach will give me headaches down the road, since I'll want those data to remain POSIX (not to mention it makes my code scarier than it should). Is there an elegant solution for this?
If you're sticking with table and vectors (as opposed to have a rectangular data/output, in which case I'd use data.table), you could do:
# x$mon is 0-based, so `x$mon %/% 2` numbers the bimesters 0..5 and
# `2 * . + 1` labels each by its 1-based first month:
# Jan-Feb -> 1, Mar-Apr -> 3, ..., Nov-Dec -> 11.
table(2*(x$mon %/% 2) + 1)
#
# 5 7 9 11
# 2 1 4 3
You could do away with using any type of format-ting of the date values themselves and just create a lookup vector for your groupings. This would also allow total flexibility in specifying what months fit into what categories. E.g.:
# Lookup vector: position k (the 1-based month number) maps to that
# month's bimester label; each label is repeated for its two months.
src <- factor(rep(c('01-02','03-04','05-06','07-08','09-10','11-12'),each=2))
# x$mon is 0-based, so add 1 before indexing into src.
src[x$mon+1]
#[1] 05-06 11-12 11-12 07-08 09-10 09-10 11-12 09-10 09-10 05-06
#Levels: 01-02 03-04 05-06 07-08 09-10 11-12
# Tabulating the mapped labels gives the bimester distribution directly.
table(src[x$mon+1])
#01-02 03-04 05-06 07-08 09-10 11-12
# 0 0 2 1 4 3