I'd like to plot a continuous vector as discrete values. To do so, I'm trying to discretize a continuous vector of doubles between 0 and 1 by turning it into a factor of ranges, using the cut function.
data:
structure(list(label = c("WP_078201646.1..87-312", "WP_077753210.1..91-300",
"WP_044287879.1..90-306", "WP_046711496.1..56-299", "WP_069060785.1..87-301",
"WP_011394873.1..91-301", "WP_015146987.1..159-358", "WP_085748967.1..86-314",
"NP_696283.1..85-318", "WP_011925568.1..89-315", "WP_013040867.1..89-307",
"WP_062116680.1..85-302", "WP_082057246.1..88-313", "WP_079078020.1..79-301",
"WP_043081767.1..100-292", "WP_085760186.1..96-309", "WP_052427986.1..92-305",
"WP_071039302.1..84-306", "WP_012939355.1..84-312", "WP_012630775.1..85-305"
), full = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), e15 = c(2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), e20 = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), id_0cov_0.8evalue_0.001 = c(1L, 2L, 4L, 5L, 6L,
9L, 11L, 13L, 14L, 17L, 19L, 22L, 23L, 25L, 31L, 37L, 38L, 42L,
44L, 45L), `archConsensus1e-3` = c("LysR_substrate", "LysR_substrate",
"LysR_substrate", "LysR_substrate", "LysR_substrate", "LysR_substrate",
"PBP_like", "LysR_substrate", "LysR_substrate", "LysR_substrate",
"LysR_substrate", "LysR_substrate", "LysR_substrate", "LysR_substrate",
"LysR_substrate", "LysR_substrate", "LysR_substrate", "LysR_substrate",
"LysR_substrate", "LysR_substrate"), hhArch = c("LysR_substrate",
"LysR_substrate", "LysR_substrate", "LysR_substrate", "LysR_substrate",
"LysR_substrate", "PBP_like", "LysR_substrate", "LysR_substrate",
"LysR_substrate", "LysR_substrate", "LysR_substrate", "LysR_substrate",
"LysR_substrate", "LysR_substrate", "LysR_substrate", "LysR_substrate",
"LysR_substrate", "LysR_substrate", "LysR_substrate"), cache_rate = c(0.00383141762452107,
0, 0, 0.0123681338668607, 0.00512820512820513, 0.0254545454545455,
0.00940438871473354, 0, 0.0571428571428571, 0.00519930675909879,
0, 0.00363636363636364, 0.0357142857142857, 0, 0, 0, 0.0535714285714286,
0, 0.00393700787401575, 0), groupsize = c(261L, 28L, 351L, 2749L,
195L, 275L, 638L, 55L, 525L, 577L, 16L, 275L, 196L, 68L, 3L,
26L, 56L, 512L, 254L, 245L), `periprate1e-3` = c(0.0613026819923372,
0.285714285714286, 0.247863247863248, 0.182975627500909, 0.0358974358974359,
0.254545454545455, 0.0125391849529781, 0, 0.157794676806084,
0.131715771230503, 0.0625, 0.0654545454545455, 0.38265306122449,
0.0735294117647059, 0, 0.0384615384615385, 0.0535714285714286,
0.09765625, 0.259842519685039, 0.257142857142857)), row.names = c(NA,
-20L), class = c("tbl_df", "tbl", "data.frame"))
The code I tried first was:
library(tidyverse)
data %>%
mutate(
cache_rate = cut(cache_rate, breaks = seq(0, 1, by = 0.1)),
`periprate1e-3` = cut(`periprate1e-3`, breaks = seq(0, 1, by = 0.1))
)
But it gives me some NA values:
# A tibble: 20 x 10
label full e15 e20 id_0cov_0.8evalue_0… `archConsensus1e… hhArch cache_rate groupsize `periprate1e-3`
<chr> <int> <int> <int> <int> <chr> <chr> <fct> <int> <fct>
1 WP_078201646.… 1 2 1 1 LysR_substrate LysR_subs… (0,0.1] 261 (0,0.1]
2 WP_077753210.… 1 2 1 2 LysR_substrate LysR_subs… NA 28 (0.2,0.3]
3 WP_044287879.… 1 2 1 4 LysR_substrate LysR_subs… NA 351 (0.2,0.3]
4 WP_046711496.… 1 2 1 5 LysR_substrate LysR_subs… (0,0.1] 2749 (0.1,0.2]
5 WP_069060785.… 1 2 1 6 LysR_substrate LysR_subs… (0,0.1] 195 (0,0.1]
6 WP_011394873.… 1 2 1 9 LysR_substrate LysR_subs… (0,0.1] 275 (0.2,0.3]
7 WP_015146987.… 1 2 1 11 PBP_like PBP_like (0,0.1] 638 (0,0.1]
8 WP_085748967.… 1 2 1 13 LysR_substrate LysR_subs… NA 55 NA
9 NP_696283.1..… 1 2 1 14 LysR_substrate LysR_subs… (0,0.1] 525 (0.1,0.2]
10 WP_011925568.… 1 2 1 17 LysR_substrate LysR_subs… (0,0.1] 577 (0.1,0.2]
11 WP_013040867.… 1 2 1 19 LysR_substrate LysR_subs… NA 16 (0,0.1]
12 WP_062116680.… 1 2 1 22 LysR_substrate LysR_subs… (0,0.1] 275 (0,0.1]
13 WP_082057246.… 1 2 1 23 LysR_substrate LysR_subs… (0,0.1] 196 (0.3,0.4]
14 WP_079078020.… 1 2 1 25 LysR_substrate LysR_subs… NA 68 (0,0.1]
15 WP_043081767.… 1 2 1 31 LysR_substrate LysR_subs… NA 3 NA
16 WP_085760186.… 1 2 1 37 LysR_substrate LysR_subs… NA 26 (0,0.1]
17 WP_052427986.… 1 2 1 38 LysR_substrate LysR_subs… (0,0.1] 56 (0,0.1]
18 WP_071039302.… 1 2 1 42 LysR_substrate LysR_subs… NA 512 (0,0.1]
19 WP_012939355.… 1 2 1 44 LysR_substrate LysR_subs… (0,0.1] 254 (0.2,0.3]
20 WP_012630775.… 1 2 1 45 LysR_substrate LysR_subs… NA 245 (0.2,0.3]
Then I tried to fix it by changing the range within the cut function:
data %>%
mutate(
cache_rate = cut(cache_rate, breaks = seq(-0.9, 1, by = 0.1)),
`periprate1e-3` = cut(`periprate1e-3`, breaks = seq(-0.9, 1, by = 0.1))
)
But the result is not great for plotting, given the negative intervals:
# A tibble: 20 x 10
label full e15 e20 id_0cov_0.8evalue_0… `archConsensus1e… hhArch cache_rate groupsize `periprate1e-3`
<chr> <int> <int> <int> <int> <chr> <chr> <fct> <int> <fct>
1 WP_078201646.… 1 2 1 1 LysR_substrate LysR_subs… (0,0.1] 261 (0,0.1]
2 WP_077753210.… 1 2 1 2 LysR_substrate LysR_subs… (-0.1,0] 28 (0.2,0.3]
3 WP_044287879.… 1 2 1 4 LysR_substrate LysR_subs… (-0.1,0] 351 (0.2,0.3]
4 WP_046711496.… 1 2 1 5 LysR_substrate LysR_subs… (0,0.1] 2749 (0.1,0.2]
5 WP_069060785.… 1 2 1 6 LysR_substrate LysR_subs… (0,0.1] 195 (0,0.1]
6 WP_011394873.… 1 2 1 9 LysR_substrate LysR_subs… (0,0.1] 275 (0.2,0.3]
7 WP_015146987.… 1 2 1 11 PBP_like PBP_like (0,0.1] 638 (0,0.1]
8 WP_085748967.… 1 2 1 13 LysR_substrate LysR_subs… (-0.1,0] 55 (-0.1,0]
9 NP_696283.1..… 1 2 1 14 LysR_substrate LysR_subs… (0,0.1] 525 (0.1,0.2]
10 WP_011925568.… 1 2 1 17 LysR_substrate LysR_subs… (0,0.1] 577 (0.1,0.2]
11 WP_013040867.… 1 2 1 19 LysR_substrate LysR_subs… (-0.1,0] 16 (0,0.1]
12 WP_062116680.… 1 2 1 22 LysR_substrate LysR_subs… (0,0.1] 275 (0,0.1]
13 WP_082057246.… 1 2 1 23 LysR_substrate LysR_subs… (0,0.1] 196 (0.3,0.4]
14 WP_079078020.… 1 2 1 25 LysR_substrate LysR_subs… (-0.1,0] 68 (0,0.1]
15 WP_043081767.… 1 2 1 31 LysR_substrate LysR_subs… (-0.1,0] 3 (-0.1,0]
16 WP_085760186.… 1 2 1 37 LysR_substrate LysR_subs… (-0.1,0] 26 (0,0.1]
17 WP_052427986.… 1 2 1 38 LysR_substrate LysR_subs… (0,0.1] 56 (0,0.1]
18 WP_071039302.… 1 2 1 42 LysR_substrate LysR_subs… (-0.1,0] 512 (0,0.1]
19 WP_012939355.… 1 2 1 44 LysR_substrate LysR_subs… (0,0.1] 254 (0.2,0.3]
20 WP_012630775.… 1 2 1 45 LysR_substrate LysR_subs… (-0.1,0] 245 (0.2,0.3]
data %>%
mutate(
cache_rate2 = cut(cache_rate, breaks = seq(-0.9 , 1, by = 0.1)),
`periprate1e-3_2` = cut(`periprate1e-3`, breaks = seq(-0.9 , 1, by = 0.1))
) %>%
ggplot(aes(cache_rate, `periprate1e-3`, color = cache_rate2, shape = `periprate1e-3_2`)) +
geom_point()
How do I discretize this vector without a mutate filled with an unwieldy case_when?
Thanks in advance
You are getting NA values because, by default, the cut function excludes values that fall exactly on the lowest break. If you add include.lowest = TRUE, your problem disappears:
data %>%
mutate(
cache_rate = cut(cache_rate, breaks = 0:10/10, include.lowest = TRUE),
`periprate1e-3` = cut(`periprate1e-3`, breaks = 0:10/10, include.lowest = TRUE)
)
#> # A tibble: 20 x 10
#> label full e15 e20 id_0cov_0.8eval~ `archConsensus1~ hhArch cache_rate
#> <chr> <int> <int> <int> <int> <chr> <chr> <fct>
#> 1 WP_0~ 1 2 1 1 LysR_substrate LysR_~ [0,0.1]
#> 2 WP_0~ 1 2 1 2 LysR_substrate LysR_~ [0,0.1]
#> 3 WP_0~ 1 2 1 4 LysR_substrate LysR_~ [0,0.1]
#> 4 WP_0~ 1 2 1 5 LysR_substrate LysR_~ [0,0.1]
#> 5 WP_0~ 1 2 1 6 LysR_substrate LysR_~ [0,0.1]
#> 6 WP_0~ 1 2 1 9 LysR_substrate LysR_~ [0,0.1]
#> 7 WP_0~ 1 2 1 11 PBP_like PBP_l~ [0,0.1]
#> 8 WP_0~ 1 2 1 13 LysR_substrate LysR_~ [0,0.1]
#> 9 NP_6~ 1 2 1 14 LysR_substrate LysR_~ [0,0.1]
#> 10 WP_0~ 1 2 1 17 LysR_substrate LysR_~ [0,0.1]
#> 11 WP_0~ 1 2 1 19 LysR_substrate LysR_~ [0,0.1]
#> 12 WP_0~ 1 2 1 22 LysR_substrate LysR_~ [0,0.1]
#> 13 WP_0~ 1 2 1 23 LysR_substrate LysR_~ [0,0.1]
#> 14 WP_0~ 1 2 1 25 LysR_substrate LysR_~ [0,0.1]
#> 15 WP_0~ 1 2 1 31 LysR_substrate LysR_~ [0,0.1]
#> 16 WP_0~ 1 2 1 37 LysR_substrate LysR_~ [0,0.1]
#> 17 WP_0~ 1 2 1 38 LysR_substrate LysR_~ [0,0.1]
#> 18 WP_0~ 1 2 1 42 LysR_substrate LysR_~ [0,0.1]
#> 19 WP_0~ 1 2 1 44 LysR_substrate LysR_~ [0,0.1]
#> 20 WP_0~ 1 2 1 45 LysR_substrate LysR_~ [0,0.1]
#> # ... with 2 more variables: groupsize <int>, `periprate1e-3` <fct>
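To see the boundary behaviour in isolation, here is a minimal sketch of my own (not from the question's data):
cut(0, breaks = seq(0, 1, by = 0.1))                          # NA: 0 sits exactly on the lowest break
cut(0, breaks = seq(0, 1, by = 0.1), include.lowest = TRUE)   # [0,0.1]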
Related
So I need to merge 2 data frames:
The first data frame contains dates in YYYY-mm-dd format and event lengths:
datetime length
2003-06-03 1
2003-06-07 1
2003-06-13 1
2003-06-17 3
2003-06-28 5
2003-07-10 1
2003-07-23 1
...
The second data frame contains dates in the same format and discharge data:
datetime q
2003-05-29 36.2
2003-05-30 34.6
2003-05-31 33.1
2003-06-01 30.7
2003-06-02 30.0
2003-06-03 153.0
2003-06-04 69.0
...
The second data frame is much larger.
I want to merge/join only the following rows of the second data frame to the first:
all rows that have the same date as the first frame (I know this can be done with left_join(df1, df2, by = c("datetime")))
two rows before that row
n-1 rows after that row, where n is the "length" value of the row in the first data frame.
I would like to identify the rows belonging to the same event as well.
Ideally I would have the following output (notice the event from 2003-06-17):
EventDatesNancy length q event#
2003-06-03 1 153.0 1
2003-06-07 1 120.0 2
2003-06-13 1 45.3 3
2003-06-15 na 110.0 4
2003-06-16 na 53.1 4
2003-06-17 3 78.0 4
2003-06-18 na 167.0 4
2003-06-19 na 145.0 4
...
I hope this makes clear what I am trying to do.
This might be one approach using tidyverse and fuzzyjoin.
First, indicate event numbers in your first data.frame. Add two columns for the start and end dates (the start date is 2 days before the date, and the end date is length - 1 days after the date).
Then, you can use fuzzy_inner_join to get the selected rows from the second data.frame. Here, you will want to keep the rows where the datetime in the second data.frame falls on or after the start date and on or before the end date from the first data.frame.
library(tidyverse)
library(fuzzyjoin)
df1$event <- seq_len(nrow(df1))
df1$start_date <- df1$datetime - 2
df1$end_date <- df1$datetime + df1$length - 1
fuzzy_inner_join(
df1,
df2,
by = c("start_date" = "datetime", "end_date" = "datetime"),
match_fun = c(`<=`, `>=`)
) %>%
select(datetime.y, length, q, event)
I tried this out with some made up data:
R> df1
datetime length
1 2003-06-03 1
2 2003-06-12 1
3 2003-06-21 1
4 2003-06-30 3
5 2003-07-09 5
6 2003-07-18 1
7 2003-07-27 1
8 2003-08-05 2
9 2003-08-14 1
10 2003-08-23 1
11 2003-09-01 3
R> df2
datetime q
1 2003-06-03 44
2 2003-06-04 52
3 2003-06-05 34
4 2003-06-06 20
5 2003-06-07 57
6 2003-06-08 67
7 2003-06-09 63
8 2003-06-10 51
9 2003-06-11 56
10 2003-06-12 37
11 2003-06-13 16
12 2003-06-14 54
13 2003-06-15 46
14 2003-06-16 6
15 2003-06-17 32
16 2003-06-18 91
17 2003-06-19 61
18 2003-06-20 42
19 2003-06-21 28
20 2003-06-22 98
21 2003-06-23 77
22 2003-06-24 81
23 2003-06-25 13
24 2003-06-26 15
25 2003-06-27 73
26 2003-06-28 38
27 2003-06-29 27
28 2003-06-30 49
29 2003-07-01 10
30 2003-07-02 89
31 2003-07-03 9
32 2003-07-04 80
33 2003-07-05 68
34 2003-07-06 26
35 2003-07-07 31
36 2003-07-08 29
37 2003-07-09 84
38 2003-07-10 60
39 2003-07-11 19
40 2003-07-12 97
41 2003-07-13 35
42 2003-07-14 47
43 2003-07-15 70
This will give the following output:
datetime.y length q event
1 2003-06-03 1 44 1
2 2003-06-10 1 51 2
3 2003-06-11 1 56 2
4 2003-06-12 1 37 2
5 2003-06-19 1 61 3
6 2003-06-20 1 42 3
7 2003-06-21 1 28 3
8 2003-06-28 3 38 4
9 2003-06-29 3 27 4
10 2003-06-30 3 49 4
11 2003-07-01 3 10 4
12 2003-07-02 3 89 4
13 2003-07-07 5 31 5
14 2003-07-08 5 29 5
15 2003-07-09 5 84 5
16 2003-07-10 5 60 5
17 2003-07-11 5 19 5
18 2003-07-12 5 97 5
19 2003-07-13 5 35 5
If the output desired is different than above, please let me know what should be different so that I can correct it.
Data
df1 <- structure(list(datetime = structure(c(12206, 12215, 12224, 12233,
12242, 12251, 12260, 12269, 12278, 12287, 12296), class = "Date"),
length = c(1, 1, 1, 3, 5, 1, 1, 2, 1, 1, 3), event = 1:11,
start_date = structure(c(12204, 12213, 12222, 12231, 12240,
12249, 12258, 12267, 12276, 12285, 12294), class = "Date"),
end_date = structure(c(12206, 12215, 12224, 12235, 12246,
12251, 12260, 12270, 12278, 12287, 12298), class = "Date")), row.names = c(NA,
-11L), class = "data.frame")
df2 <- structure(list(datetime = structure(c(12206, 12207, 12208, 12209,
12210, 12211, 12212, 12213, 12214, 12215, 12216, 12217, 12218,
12219, 12220, 12221, 12222, 12223, 12224, 12225, 12226, 12227,
12228, 12229, 12230, 12231, 12232, 12233, 12234, 12235, 12236,
12237, 12238, 12239, 12240, 12241, 12242, 12243, 12244, 12245,
12246, 12247, 12248), class = "Date"), q = c(44L, 52L, 34L, 20L,
57L, 67L, 63L, 51L, 56L, 37L, 16L, 54L, 46L, 6L, 32L, 91L, 61L,
42L, 28L, 98L, 77L, 81L, 13L, 15L, 73L, 38L, 27L, 49L, 10L, 89L,
9L, 80L, 68L, 26L, 31L, 29L, 84L, 60L, 19L, 97L, 35L, 47L, 70L
)), class = "data.frame", row.names = c(NA, -43L))
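As a side note, newer dplyr versions can express this overlap join directly; here is a sketch assuming dplyr >= 1.1.0 and the df1/df2 above:
library(dplyr)
# join_by() accepts inequality conditions, so fuzzyjoin is not required here
inner_join(df1, df2,
  by = join_by(start_date <= datetime, end_date >= datetime)) %>%
  select(datetime = datetime.y, length, q, event)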
I have a dataframe like this:
ID year age wage
1 2 1981 22 10000
2 2 1982 23 11000
3 2 1983 24 11500
4 2 1984 25 11000
5 2 1985 26 14000
6 2 1986 27 16000
7 2 1987 28 20000
8 2 1988 29 19000
9 2 1989 30 20000
10 2 1990 31 20000
11 2 1991 32 22000
12 2 1992 33 25000
13 2 1993 34 0
14 2 1994 35 NA
15 2 1995 36 0
16 2 1996 37 NA
17 2 1997 38 0
18 2 1998 39 NA
19 2 1999 40 0
20 2 2000 41 NA
21 2 2001 42 0
22 2 2002 43 NA
23 2 2003 44 0
24 2 2004 45 NA
25 2 2005 46 5500
26 2 2006 47 NA
27 2 2007 48 5000
28 2 2008 49 NA
29 2 2009 50 6000
30 2 2010 51 NA
31 2 2011 52 19000
32 2 2012 53 NA
33 2 2013 54 21000
34 2 2014 55 NA
35 2 2015 56 23000
36 3 1984 22 1300
37 3 1985 23 0
38 3 1986 24 1500
39 3 1987 25 1000
40 3 1988 26 0
I want to use an individual-specific regression of wage on age and age-squared to impute missing wage observations. I only want to impute when at least 5 non-missing observations are available.
As suggested by jay.sf, I tried the following but with fitted values:
df_imp <- do.call(rbind,
by(df, df$ID, function(x) {
IDs <- which(is.na(x$wage))
if (length(x$wage[- IDs]) >= 5) {
b <- lm(wage ~ poly(age, 2, raw=TRUE), x)$fitted.values
x$wage[IDs] <- with(x, b)[IDs]
}
return(x)
}))
I got the following results:
ID year age wage
36 2 1981 22 10000.000
37 2 1982 23 11000.000
38 2 1983 24 11500.000
39 2 1984 25 11000.000
40 2 1985 26 14000.000
41 2 1986 27 16000.000
42 2 1987 28 20000.000
43 2 1988 29 19000.000
44 2 1989 30 20000.000
45 2 1990 31 20000.000
46 2 1991 32 22000.000
47 2 1992 33 25000.000
48 2 1993 34 0.000
49 2 1994 35 7291.777
50 2 1995 36 0.000
51 2 1996 37 6779.133
52 2 1997 38 0.000
53 2 1998 39 7591.597
54 2 1999 40 0.000
55 2 2000 41 9729.168
56 2 2001 42 0.000
57 2 2002 43 13191.847
58 2 2003 44 0.000
59 2 2004 45 17979.633
60 2 2005 46 5500.000
61 2 2006 47 NA
62 2 2007 48 5000.000
63 2 2008 49 NA
64 2 2009 50 6000.000
65 2 2010 51 NA
66 2 2011 52 19000.000
67 2 2012 53 NA
68 2 2013 54 21000.000
69 2 2014 55 NA
70 2 2015 56 23000.000
You could use a simple if statement, without an else. Define an index vector IDs that identifies the missings, which you use both to count them and to subset your Y column wage.
For this you can use by(), which splits your data similar to split() but lets you apply a function; just rbind the result.
It's probably wiser to use the coefficients rather than the fitted values, because lm() drops the rows where Y is NA, so no fitted values exist for them. And you need to use raw=TRUE in the poly.
DF.imp <- do.call(rbind,
by(DF, DF$ID, function(x) {
IDs <- which(is.na(x$wage))
if (length(x$wage[- IDs]) >= 5) {
b <- lm(wage ~ poly(age, 2, raw=TRUE), x)$coefficients
x$wage[IDs] <- with(x, (b[1] + b[2]*age + b[3]*age^2))[IDs]
}
return(x)
}))
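For reference, predict() with newdata avoids writing the polynomial out by hand; a sketch of the same approach (my variation, not part of the original answer):
DF.imp2 <- do.call(rbind,
  by(DF, DF$ID, function(x) {
    IDs <- which(is.na(x$wage))
    if (length(x$wage[- IDs]) >= 5) {
      fit <- lm(wage ~ poly(age, 2, raw=TRUE), x)
      # lm() dropped the NA rows when fitting; predict() evaluates the fitted
      # polynomial at the ages of exactly those rows
      x$wage[IDs] <- predict(fit, newdata = x[IDs, ])
    }
    return(x)
  }))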
Note that I've slightly changed your example data so that ID 3 also has missings, but fewer than 5 non-missings.
Result
DF.imp
# ID year age wage
# 2.1 2 1981 22 10000.000
# 2.2 2 1982 23 11000.000
# 2.3 2 1983 24 11500.000
# 2.4 2 1984 25 11000.000
# 2.5 2 1985 26 14000.000
# 2.6 2 1986 27 16000.000
# 2.7 2 1987 28 20000.000
# 2.8 2 1988 29 19000.000
# 2.9 2 1989 30 20000.000
# 2.10 2 1990 31 20000.000
# 2.11 2 1991 32 22000.000
# 2.12 2 1992 33 25000.000
# 2.13 2 1993 34 0.000
# 2.14 2 1994 35 7626.986
# 2.15 2 1995 36 0.000
# 2.16 2 1996 37 7039.387
# 2.17 2 1997 38 0.000
# 2.18 2 1998 39 6783.065
# 2.19 2 1999 40 0.000
# 2.20 2 2000 41 6858.020
# 2.21 2 2001 42 0.000
# 2.22 2 2002 43 7264.252
# 2.23 2 2003 44 0.000
# 2.24 2 2004 45 8001.761
# 2.25 2 2005 46 5500.000
# 2.26 2 2006 47 9070.546
# 2.27 2 2007 48 5000.000
# 2.28 2 2008 49 10470.609
# 2.29 2 2009 50 6000.000
# 2.30 2 2010 51 12201.948
# 2.31 2 2011 52 19000.000
# 2.32 2 2012 53 14264.565
# 2.33 2 2013 54 21000.000
# 2.34 2 2014 55 16658.458
# 2.35 2 2015 56 23000.000
# 3.36 3 1984 22 1300.000
# 3.37 3 1985 23 NA
# 3.38 3 1986 24 1500.000
# 3.39 3 1987 25 1000.000
# 3.40 3 1988 26 NA
Data
DF <- structure(list(ID = c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), year = c(1981L,
1982L, 1983L, 1984L, 1985L, 1986L, 1987L, 1988L, 1989L, 1990L,
1991L, 1992L, 1993L, 1994L, 1995L, 1996L, 1997L, 1998L, 1999L,
2000L, 2001L, 2002L, 2003L, 2004L, 2005L, 2006L, 2007L, 2008L,
2009L, 2010L, 2011L, 2012L, 2013L, 2014L, 2015L, 1984L, 1985L,
1986L, 1987L, 1988L), age = c(22L, 23L, 24L, 25L, 26L, 27L, 28L,
29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L,
42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 52L, 53L, 54L,
55L, 56L, 22L, 23L, 24L, 25L, 26L), wage = c(10000L, 11000L,
11500L, 11000L, 14000L, 16000L, 20000L, 19000L, 20000L, 20000L,
22000L, 25000L, 0L, NA, 0L, NA, 0L, NA, 0L, NA, 0L, NA, 0L, NA,
5500L, NA, 5000L, NA, 6000L, NA, 19000L, NA, 21000L, NA, 23000L,
1300L, NA, 1500L, 1000L, NA)), row.names = c(NA, -40L), class = "data.frame")
I have two dataframes.
mydata1=structure(list(ID_WORKES = c(58005854L, 58005854L, 58002666L,
58002666L), ID_SP_NAR = c(463L, 1951L, 21L, 465L), KOD_DEPO = c(3786L,
3786L, 1439L, 1439L), KOD_DOR = c(58L, 58L, 92L, 92L), COLUMN_MASH = c(6L,
6L, 5L, 5L), prop_violations = structure(c(1L, 2L, 2L, 2L), .Label = c("0.2",
"1"), class = "factor"), mash_score = c(0L, 2L, 2L, 2L)), .Names = c("ID_WORKES",
"ID_SP_NAR", "KOD_DEPO", "KOD_DOR", "COLUMN_MASH", "prop_violations",
"mash_score"), class = "data.frame", row.names = c(NA, -4L))
mydata2=structure(list(ID_SP_NAR = c(463L, 1951L, 21L, 465L, 500L, 600L
)), .Names = "ID_SP_NAR", class = "data.frame", row.names = c(NA,
-6L))
I need to cross-join/merge these data frames by ID_SP_NAR. mydata2 contains only the key variable ID_SP_NAR.
I need to join them in such a way that if an ID_WORKES does not have some of the ID_SP_NAR codes from mydata2, those codes are inserted into the dataset, with zero values in the variables prop_violations and mash_score.
I.e., ID_SP_NAR in mydata2 has these values:
ID_SP_NAR
463
1951
21
465
500
600
ID_WORKES = 58005854 has 463 and 1951, but none of the others;
ID_WORKES = 58002666 has 21 and 465, and no others!
So desired output after cross join
ID_WORKES ID_SP_NAR KOD_DEPO KOD_DOR COLUMN_MASH prop_violations mash_score
1 58005854 463 3786 58 6 0.2 0
2 58005854 1951 3786 58 6 1 2
3 58005854 21 3786 58 6 0 0
4 58005854 465 3786 58 6 0 0
5 58005854 500 3786 58 6 0 0
6 58005854 600 3786 58 6 0 0
7 58002666 21 1439 92 5 1 2
8 58002666 465 1439 92 5 1 2
9 58002666 500 1439 92 5 0 0
10 58002666 600 1439 92 5 0 0
11 58002666 463 1439 92 5 0 0
12 58002666 1951 1439 92 5 0 0
KOD_DEPO, KOD_DOR and COLUMN_MASH have fixed values per worker; they must be preserved too.
How to do that?
merge(mydata1, mydata2, by = "ID_SP_NAR") is not working (I also tried a left join, which doesn't work either): it doesn't insert zeros the way I want.
We could use complete from tidyr to expand the dataset based on 'ID_WORKES' and the values of 'ID_SP_NAR' in the second dataset.
library(tidyverse)
mydata1 %>%
mutate_if(is.factor, as.character) %>%
complete(ID_WORKES, ID_SP_NAR = mydata2$ID_SP_NAR,
fill = list(prop_violations = '0', mash_score = 0)) %>%
fill(3:5)
# A tibble: 12 x 7
# ID_WORKES ID_SP_NAR KOD_DEPO KOD_DOR COLUMN_MASH prop_violations mash_score
# <int> <int> <int> <int> <int> <chr> <dbl>
# 1 58002666 21 1439 92 5 1 2
# 2 58002666 463 1439 92 5 0 0
# 3 58002666 465 1439 92 5 1 2
# 4 58002666 500 1439 92 5 0 0
# 5 58002666 600 1439 92 5 0 0
# 6 58002666 1951 1439 92 5 0 0
# 7 58005854 21 1439 92 5 0 0
# 8 58005854 463 3786 58 6 0.2 0
# 9 58005854 465 3786 58 6 0 0
#10 58005854 500 3786 58 6 0 0
#11 58005854 600 3786 58 6 0 0
#12 58005854 1951 3786 58 6 1 2
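For comparison, a base R sketch of the same idea (my own illustration, relying on KOD_DEPO, KOD_DOR and COLUMN_MASH being constant per worker):
# Convert the factor so "0" can be inserted as a plain string
mydata1$prop_violations <- as.character(mydata1$prop_violations)
# Cross join: every worker (with its fixed columns) x every code in mydata2
grid <- merge(unique(mydata1[c("ID_WORKES", "KOD_DEPO", "KOD_DOR", "COLUMN_MASH")]),
              mydata2)
# Left-join the observed rows back in, then zero-fill the gaps
out <- merge(grid, mydata1, all.x = TRUE)
out$prop_violations[is.na(out$prop_violations)] <- "0"
out$mash_score[is.na(out$mash_score)] <- 0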
Thanks in advance for any help or suggestions on this. Here is a shortened example of the dataframe I am working with.
boxscore_stats = structure(list(game_id = c(157046L, 157046L, 157046L, 157046L,
157046L, 157046L, 157046L, 157046L, 157046L, 157046L, 157046L,
157046L, 157046L, 157046L, 157046L, 157046L, 157046L, 157046L,
159151L, 159151L, 159151L, 159151L, 159151L, 159151L, 159151L,
159151L, 159151L, 159151L, 159151L, 159151L, 159151L, 159151L,
159151L, 159151L, 159151L, 159151L, 159151L, 159151L, 159151L,
159151L), team_id = c(116975, 116975, 116975, 116975, 116975,
116975, 116975, 116975, 116975, 120310, 120310, 120310, 120310,
120310, 120310, 120310, 120310, 120310, 121910, 121910, 121910,
121910, 121910, 121910, 121910, 121910, 121910, 121910, 122072,
122072, 122072, 122072, 122072, 122072, 122072, 122072, 122072,
122072, 122072, 122072), minutes_played = c(18.76, 14.63, 8,
16.69, 24.62, 32, 12.79, 5.28, 3.22, 24.35, 10.18, 20.65, 9.59,
25.08, 14.12, 17.46, 23.15, 15.43, 22.84, 19.27, 21.31, 6.41,
17.57, 17.4, 17.29, 7.22, 12.09, 17.25, 2.28, 16.87, 6.6, 19.73,
6.31, 13.25, 26.25, 6.08, 28.71, 11.2, 17.54, 5.17), fieldGoalsMade = c(1L,
1L, 4L, 1L, 2L, 7L, 1L, 1L, 1L, 4L, 0L, 3L, 1L, 3L, 0L, 6L, 7L,
1L, 7L, 4L, 5L, 1L, 2L, 6L, 2L, 0L, 1L, 3L, 0L, 1L, 1L, 3L, 0L,
1L, 11L, 2L, 5L, 1L, 2L, 1L), fieldGoalAttempts = c(8L, 6L, 7L,
2L, 9L, 16L, 3L, 1L, 2L, 12L, 4L, 12L, 3L, 11L, 4L, 9L, 13L,
6L, 12L, 10L, 14L, 2L, 6L, 11L, 6L, 2L, 2L, 6L, 0L, 5L, 3L, 10L,
2L, 3L, 21L, 3L, 17L, 4L, 9L, 2L)), .Names = c("game_id", "team_id",
"minutes_played", "fieldGoalsMade", "fieldGoalAttempts"), row.names = c(NA,
40L), class = "data.frame")
boxscore_stats
game_id team_id minutes_played fieldGoalsMade fieldGoalAttempts
1 157046 116975 18.76 1 8
2 157046 116975 14.63 1 6
3 157046 116975 8.00 4 7
4 157046 116975 16.69 1 2
5 157046 116975 24.62 2 9
6 157046 116975 32.00 7 16
7 157046 116975 12.79 1 3
8 157046 116975 5.28 1 1
9 157046 116975 3.22 1 2
10 157046 120310 24.35 4 12
11 157046 120310 10.18 0 4
12 157046 120310 20.65 3 12
13 157046 120310 9.59 1 3
14 157046 120310 25.08 3 11
15 157046 120310 14.12 0 4
16 157046 120310 17.46 6 9
17 157046 120310 23.15 7 13
18 157046 120310 15.43 1 6
19 159151 121910 22.84 7 12
20 159151 121910 19.27 4 10
21 159151 121910 21.31 5 14
22 159151 121910 6.41 1 2
23 159151 121910 17.57 2 6
24 159151 121910 17.40 6 11
25 159151 121910 17.29 2 6
26 159151 121910 7.22 0 2
27 159151 121910 12.09 1 2
28 159151 121910 17.25 3 6
29 159151 122072 2.28 0 0
30 159151 122072 16.87 1 5
31 159151 122072 6.60 1 3
32 159151 122072 19.73 3 10
33 159151 122072 6.31 0 2
34 159151 122072 13.25 1 3
35 159151 122072 26.25 11 21
36 159151 122072 6.08 2 3
37 159151 122072 28.71 5 17
38 159151 122072 11.20 1 4
39 159151 122072 17.54 2 9
40 159151 122072 5.17 1 2
The important things to note about this dataframe are that each game_id corresponds to two team_ids, for the two teams that played in the game, and that each game_id is unique to one game of basketball. Each row corresponds to the stats for one player on the team_id's team in that game. The example above has only two games / 4 teams / 40 players, but my full dataframe has hundreds of games, with each team_id showing up many times.
My first aggregation, which I was able to do, was to aggregate everything by team_id. This code got the job done for me for the first aggregation:
boxscore_stats_aggregated = aggregate(boxscore_stats, by = list(boxscore_stats[, 2]), FUN = sum)
which was fairly straightforward. For any team_id, I had aggregated all of their minutes played, all of their fieldGoalsMade, etc. For my next aggregation, though, I need to aggregate by team_id again, but instead of aggregating a team by its own rows / stats, I need to aggregate the rows / stats of its opponents. This answers the question "For any team, how many fieldGoalsMade did they allow in total to opponents?", etc. So in this case, for team_id = 116975, I would want to aggregate all the rows with team_id = 120310. Of course, the next time team_id 116975 appears in my dataframe in a new game, it is likely playing a different opponent, so this aggregation is not as simple as aggregating by team_id 120310.
I think I should be able to use the fact that each pair of team_ids is unique to its game_id to make this aggregation possible, but I am struggling with how to implement it.
Thanks!
Here is an approach using data.table:
(1) Read in the data:
# Load package
library(data.table)
# Load your data
boxscore_stats <- fread("row game_id team_id minutes_played fieldGoalsMade fieldGoalAttempts
1 157046 116975 18.76 1 8
2 157046 116975 14.63 1 6
3 157046 116975 8.00 4 7
4 157046 116975 16.69 1 2
5 157046 116975 24.62 2 9
6 157046 116975 32.00 7 16
7 157046 116975 12.79 1 3
8 157046 116975 5.28 1 1
9 157046 116975 3.22 1 2
10 157046 120310 24.35 4 12
11 157046 120310 10.18 0 4
12 157046 120310 20.65 3 12
13 157046 120310 9.59 1 3
14 157046 120310 25.08 3 11
15 157046 120310 14.12 0 4
16 157046 120310 17.46 6 9
17 157046 120310 23.15 7 13
18 157046 120310 15.43 1 6
19 159151 121910 22.84 7 12
20 159151 121910 19.27 4 10
21 159151 121910 21.31 5 14
22 159151 121910 6.41 1 2
23 159151 121910 17.57 2 6
24 159151 121910 17.40 6 11
25 159151 121910 17.29 2 6
26 159151 121910 7.22 0 2
27 159151 121910 12.09 1 2
28 159151 121910 17.25 3 6
29 159151 122072 2.28 0 0
30 159151 122072 16.87 1 5
31 159151 122072 6.60 1 3
32 159151 122072 19.73 3 10
33 159151 122072 6.31 0 2
34 159151 122072 13.25 1 3
35 159151 122072 26.25 11 21
36 159151 122072 6.08 2 3
37 159151 122072 28.71 5 17
38 159151 122072 11.20 1 4
39 159151 122072 17.54 2 9
40 159151 122072 5.17 1 2
")
(2) Proceed with the actual calculations:
# Aggregate on team-and game level (data.table style)
boxscore_stats_aggregated <- boxscore_stats[, lapply(.SD, sum), by = list(game_id, team_id)]
# Match EVERY team to each opponent, i.e. still two rows per game
# but with columns for the opponent's performance added.
# Some teams drop out with this dummy data because their opponent's data is missing.
merge(boxscore_stats_aggregated, boxscore_stats_aggregated,
by="game_id", suffixes = c("", ".opponent"))[team_id!=team_id.opponent,]
The output looks like this:
# > output
# game_id team_id row minutes_played fieldGoalsMade fieldGoalAttempts team_id.opponent row.opponent minutes_played.opponent fieldGoalsMade.opponent fieldGoalAttempts.opponent
# 1: 1413414 116975 45 135.99 19 54 120310 126 160.01 25 74
# 2: 1413414 120310 126 160.01 25 74 116975 45 135.99 19 54
And just in case, for the OP or future readers to consider, below is a base R version with merge() for side-by-side aggregates of team and opponent by game_id. A temporary staging column, gamecount, is needed.
# TEAM AGGREGATION
aggdf <- aggregate(.~game_id + team_id, boxscore_stats, FUN = sum)
# GAME COUNT BY TEAM (TEMP COL USED FOR MERGE/FILTER)
aggdf$gamecount <- sapply(1:nrow(aggdf), function(i)
sum(aggdf[1:i, c("game_id")] == aggdf$game_id[i]))
# MERGE AND FILTER
mdf <- merge(aggdf, aggdf, by="game_id")
mdf <- mdf[mdf$team_id.x != mdf$team_id.y & mdf$gamecount.x == 1,]
mdf$gamecount.x <- mdf$gamecount.y <- NULL
# RENAME COL AND ROW NAMES
names(mdf)[grepl("\\.x", names(mdf))] <- gsub("\\.x", "",
names(mdf)[grepl("\\.x", names(mdf))])
names(mdf)[grepl("\\.y", names(mdf))] <- gsub("\\.y", ".opp",
names(mdf)[grepl("\\.y", names(mdf))])
rownames(mdf) <- 1:nrow(mdf)
# game_id team_id minutes_played fieldGoalsMade fieldGoalAttempts team_id.opp
# 1 157046 116975 135.99 19 54 120310
# 2 159151 121910 158.65 31 71 122072
# minutes_played.opp fieldGoalsMade.opp fieldGoalAttempts.opp
# 1 160.01 25 74
# 2 159.99 28 79
If you want to isolate single team_ids, I would use the dplyr package.
For example if you wanted to know % of field goals per team I would write something like:
boxscore_stats %>%
group_by(team_id) %>%
summarize(perc_fg = sum(fieldGoalsMade)/sum(fieldGoalAttempts))
This would give you a new data.frame aggregated by team ID.
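Extending that idea to the opponent aggregation (a sketch of mine, assuming dplyr >= 1.0 for across() and exactly two teams per game):
library(dplyr)
team_totals <- boxscore_stats %>%
  group_by(game_id, team_id) %>%
  summarize(across(minutes_played:fieldGoalAttempts, sum), .groups = "drop")
# Self-join on game_id and keep the rows pairing each team with its opponent
team_totals %>%
  inner_join(team_totals, by = "game_id", suffix = c("", ".opp")) %>%
  filter(team_id != team_id.opp)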
From the table below I need to combine the lines by calculating the average value for the lines with the same ID (column 2).
I was thinking of the plyr function??
ddply(df, summarize, value = average(ID))
df:
miRNA ID 100G 100R 106G 106R 122G 122R 124G 124R 126G 126R 134G 134R 141G 141R 167G 167R 185G 185R
1 hsa-miR-106a ID7 1585 423 180 113 598 266 227 242 70 106 2703 442 715 309 546 113 358 309
2 hsa-miR-1185-1 ID2 10 1 3 3 11 8 4 4 28 2 13 3 6 3 6 4 7 5
3 hsa-miR-1185-2 ID2 2 0 2 1 5 1 1 0 4 1 1 1 3 2 2 0 2 1
4 hsa-miR-1197 ID2 2 0 0 5 3 3 0 4 16 0 4 1 3 0 0 2 2 4
5 hsa-miR-127 ID3 29 17 6 55 40 35 6 20 171 10 32 21 23 25 10 14 32 55
Summary of original data:
> str(ClusterMatrix)
'data.frame': 113 obs. of 98 variables:
$ miRNA: Factor w/ 202 levels "hsa-miR-106a",..: 1 3 4 6 8 8 14 15 15 16 ...
$ ID : Factor w/ 27 levels "ID1","ID10","ID11",..: 25 12 12 12 21 21 12 21 21 6 ...
$ 100G : Factor w/ 308 levels "-0.307749042739963",..: 279 11 3 3 101 42 139 158 215 222 ...
$ 100R : Factor w/ 316 levels "-0.138028803567403",..: 207 7 8 8 18 42 128 183 232 209 ...
$ 106G : Factor w/ 260 levels "-0.103556709881933",..: 171 4 1 3 7 258 95 110 149 162 ...
$ 106R : Factor w/ 300 levels "-0.141810346640204",..: 141 4 6 2 108 41 146 196 244 267 ...
$ 122G : Factor w/ 336 levels "-0.0409548922061764",..: 237 12 4 6 103 47 148 203 257 264 ...
$ 122R : Factor w/ 316 levels "-0.135708706475279",..: 177 1 8 6 36 44 131 192 239 244 ...
$ 124G : Factor w/ 267 levels "-0.348439853247856",..: 210 5 2 3 7 50 126 138 188 249 ...
$ 124R : Factor w/ 303 levels "-0.176414190219115",..: 193 3 7 3 21 52 167 200 238 239 ...
$ 126G : Factor w/ 307 levels "-0.227658806811544",..: 122 88 5 76 169 61 240 220 281 265 ...
$ 126R : Factor w/ 249 levels "-0.271925865853123",..: 119 1 2 3 11 247 78 110 151 193 ...
$ 134G : Factor w/ 344 levels "-0.106333543799583",..: 304 14 8 5 33 48 150 196 248 231 ...
$ 134R : Factor w/ 300 levels "-0.0997616469801097",..: 183 5 7 7 22 298 113 159 213 221 ...
$ 141G : Factor w/ 335 levels "-0.134429748398679",..: 253 7 3 3 24 29 142 137 223 302 ...
$ 141R : Factor w/ 314 levels "-0.143299688877927",..: 210 4 5 7 98 54 154 199 255 251 ...
$ 167G : Factor w/ 306 levels "-0.211181452126958",..: 222 7 4 6 11 292 91 101 175 226 ...
$ 167R : Factor w/ 282 levels "-0.0490740880560127",..: 130 2 6 4 15 282 110 146 196 197 ...
$ 185G : Factor w/ 317 levels "-0.0567841338235346",..: 218 2 7 7 33 34 130 194 227 259 ...
We can use dplyr. We group by 'ID' and use mutate_each to create columns that show the mean value of '100G' through '185R', selecting those columns inside mutate_each with a regex pattern in matches. Then we cbind (bind_cols) the original dataset with the mean columns and convert to a data.frame if needed. We can also change the column names of the mean columns.
library(dplyr)
out <- df1 %>%
group_by(ID) %>%
mutate_each(funs(mean=mean(., na.rm=TRUE)), matches('^\\d+')) %>%
setNames(., c(names(.)[1:2], paste0('Mean_', names(.)[3:ncol(.)]))) %>%
as.data.frame()
out1 <- bind_cols(df1, out[-(1:2)])
out1
# miRNA ID 100G 100R 106G 106R 122G 122R 124G 124R 126G 126R 134G
#1 hsa-miR-106a ID7 1585 423 180 113 598 266 227 242 70 106 2703
#2 hsa-miR-1185-1 ID2 10 1 3 3 11 8 4 4 28 2 13
#3 hsa-miR-1185-2 ID2 2 0 2 1 5 1 1 0 4 1 1
#4 hsa-miR-1197 ID2 2 0 0 5 3 3 0 4 16 0 4
#5 hsa-miR-127 ID3 29 17 6 55 40 35 6 20 171 10 32
# 134R 141G 141R 167G 167R 185G 185R Mean_100G Mean_100R Mean_106G
#1 442 715 309 546 113 358 309 1585.000000 423.0000000 180.000000
#2 3 6 3 6 4 7 5 4.666667 0.3333333 1.666667
#3 1 3 2 2 0 2 1 4.666667 0.3333333 1.666667
#4 1 3 0 0 2 2 4 4.666667 0.3333333 1.666667
#5 21 23 25 10 14 32 55 29.000000 17.0000000 6.000000
# Mean_106R Mean_122G Mean_122R Mean_124G Mean_124R Mean_126G Mean_126R
#1 113 598.000000 266 227.000000 242.000000 70 106
#2 3 6.333333 4 1.666667 2.666667 16 1
#3 3 6.333333 4 1.666667 2.666667 16 1
#4 3 6.333333 4 1.666667 2.666667 16 1
#5 55 40.000000 35 6.000000 20.000000 171 10
# Mean_134G Mean_134R Mean_141G Mean_141R Mean_167G Mean_167R Mean_185G
#1 2703 442.000000 715 309.000000 546.000000 113 358.000000
#2 6 1.666667 4 1.666667 2.666667 2 3.666667
#3 6 1.666667 4 1.666667 2.666667 2 3.666667
#4 6 1.666667 4 1.666667 2.666667 2 3.666667
#5 32 21.000000 23 25.000000 10.000000 14 32.000000
# Mean_185R
#1 309.000000
#2 3.333333
#3 3.333333
#4 3.333333
#5 55.000000
EDIT: If we need a single-row mean for each 'ID', we can use summarise_each:
df1 %>%
group_by(ID) %>%
summarise_each(funs(mean=mean(., na.rm=TRUE)), matches('^\\d+'))
EDIT2: Based on the OP's update, the original dataset's ('ClusterMatrix') columns are all of class factor. We need to convert the columns to numeric before taking the mean. There are two options to convert a factor to numeric: 1) as.numeric(as.character(x)), which may be a bit slower, and 2) as.numeric(levels(x))[x], which is faster. Here I am using the first method as it may be clearer.
ClusterMatrix %>%
group_by(ID) %>%
summarise_each(funs(mean= mean(as.numeric(as.character(.)),
na.rm=TRUE)), matches('^\\d+'))
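(As a side note for future readers: mutate_each/summarise_each have since been deprecated; here is a sketch of the same summary with across(), assuming dplyr >= 1.0.)
library(dplyr)
ClusterMatrix %>%
  group_by(ID) %>%
  summarise(across(matches('^\\d+'),
                   ~ mean(as.numeric(as.character(.x)), na.rm = TRUE),
                   .names = 'Mean_{.col}'))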
data
df1 <- structure(list(miRNA = c("hsa-miR-106a", "hsa-miR-1185-1",
"hsa-miR-1185-2",
"hsa-miR-1197", "hsa-miR-127"), ID = c("ID7", "ID2", "ID2", "ID2",
"ID3"), `100G` = c(1585L, 10L, 2L, 2L, 29L), `100R` = c(423L,
1L, 0L, 0L, 17L), `106G` = c(180L, 3L, 2L, 0L, 6L), `106R` = c(113L,
3L, 1L, 5L, 55L), `122G` = c(598L, 11L, 5L, 3L, 40L), `122R` = c(266L,
8L, 1L, 3L, 35L), `124G` = c(227L, 4L, 1L, 0L, 6L), `124R` = c(242L,
4L, 0L, 4L, 20L), `126G` = c(70L, 28L, 4L, 16L, 171L), `126R` = c(106L,
2L, 1L, 0L, 10L), `134G` = c(2703L, 13L, 1L, 4L, 32L), `134R` = c(442L,
3L, 1L, 1L, 21L), `141G` = c(715L, 6L, 3L, 3L, 23L), `141R` = c(309L,
3L, 2L, 0L, 25L), `167G` = c(546L, 6L, 2L, 0L, 10L), `167R` = c(113L,
4L, 0L, 2L, 14L), `185G` = c(358L, 7L, 2L, 2L, 32L), `185R` = c(309L,
5L, 1L, 4L, 55L)), .Names = c("miRNA", "ID", "100G", "100R",
"106G", "106R", "122G", "122R", "124G", "124R", "126G", "126R",
"134G", "134R", "141G", "141R", "167G", "167R", "185G", "185R"
), class = "data.frame", row.names = c("1", "2", "3", "4", "5"
))