Convert string containing roman numerals to numeric using R - r

I want to convert "stage i", "stage ii", etc to numeric "1" and "2".
pheno_df$pathologic_stage <- gsub("stage ","",pheno_df$pathologic_stage)
as.numeric(factor(pheno_df$pathologic_stage))
Current output:
3 2 3 3 2 5
Desired output:
2 1 2 2 1 4
Data sample:
> dput(pheno_df$pathologic_stage)
c("stage ii", "stage i", "stage ii", "stage ii", "stage i", "stage iv",

Extract the numeral part, then convert to roman and back to numeric:
v <- c("stage ii", "stage i", "stage ii", "stage ii", "stage i", "stage iv")
as.numeric(as.roman(gsub("stage ", "", v)))
#[1] 2 1 2 2 1 4

Related

Apply a condition to a group when condition is only on one row

I have this data set as follows
structure(list(count = c("0-0", "1-0", "2-0", "2-1", "0-0", "0-1",
"0-2", "1-2", "1-2", "0-0", "0-1", "1-1", "1-2", "2-2", "2-2",
"0-0", "1-0", "1-1", "2-1", "3-1", "3-2", "0-0", "1-0", "1-1",
"0-0", "0-1", "1-1", "1-2", "0-0", "1-0", "1-1", "0-0", "0-1",
"0-0", "1-0", "1-1", "1-2", "0-0", "0-1", "0-2", "0-0", "0-1",
"0-2", "1-2", "1-2", "0-0", "0-0", "0-1", "0-0", "0-0", "0-0",
"1-0", "2-0", "0-0", "1-0", "2-0", "3-0", "0-0", "0-0", "1-0",
"1-1", "0-0", "0-0", "1-0", "2-0", "0-0", "0-1", "0-2", "0-2",
"0-0", "1-0", "1-1", "2-1", "2-2", "2-2", "0-0", "1-0", "2-0",
"2-1", "2-2", "0-0", "0-1", "0-0", "0-0", "0-1", "0-2", "0-2",
"1-2", "2-2", "0-0", "1-0", "1-1", "0-0", "1-0", "0-0", "0-1",
"1-1", "1-2"), pitchResult = c("Ball", "Ball",
"Foul", "Ground Out", "Foul", "Strike Looking", "Ball", "Foul",
"Ground Out", "Strike Looking", "Ball", "Foul", "Ball", "Foul",
"Ground Out", "Ball", "Strike Looking", "Ball", "Ball", "Strike Swinging",
"Single on a Fly Ball", "Ball", "Strike Swinging", "Double Play",
"Strike Looking", "Ball", "Strike Looking", "Ground Out", "Ball",
"Strike Swinging", "Ground Out", "Foul", "Single on a Fly Ball",
"Ball", "Strike Swinging", "Strike Swinging", "Strikeout (Swinging)",
"Strike Looking", "Foul", "Strikeout (Swinging)", "Strike Looking",
"Strike Looking", "Ball", "Foul", "Fly Out", "Fly Out", "Strike Looking",
"Fly Out", "Double on a Fly Ball", "Hit By Pitch", "Ball", "Ball",
"Fly Out", "Ball", "Ball", "Ball", "Walk", "Double Play", "Ball",
"Strike Looking", "Single on a Ground Ball", "Fly Out", "Ball",
"Ball", "Fly Out", "Strike Looking", "Foul", "Foul", "Single on a Ground Ball",
"Ball", "Strike Looking", "Ball", "Foul", "Foul", "Home Run on a 402.65 ft Fly Ball",
"Ball", "Ball", "Strike Swinging", "Foul", "Fly Out", "Strike Swinging",
"Line Out", "Fly Out", "Strike Looking", "Foul", "Foul", "Ball",
"Ball", "Single on a Ground Ball", "Ball", "Strike Looking",
"Fielder's Choice", "Ball", "Ground Out", "Foul", "Ball", "Strike Swinging",
"Single on a Line Drive"), gameId = c(536158720L,
536158720L, 536158720L, 536158720L, 536158720L, 536158720L, 536158720L,
536158720L, 536158720L, 536158720L, 536158720L, 536158720L, 536158720L,
536158720L, 536158720L, 536158720L, 536158720L, 536158720L, 536158720L,
536158720L, 536158720L, 536158720L, 536158720L, 536158720L, 536158720L,
536158720L, 536158720L, 536158720L, 536158720L, 536158720L, 536158720L,
536158720L, 536158720L, 536158720L, 536158720L, 536158720L, 536158720L,
536158720L, 536158720L, 536158720L, 536158720L, 536158720L, 536158720L,
536158720L, 536158720L, 536158720L, 536158720L, 536158720L, 536158720L,
536158720L, 536158720L, 536158720L, 536158720L, 536158720L, 536158720L,
536158720L, 536158720L, 536158720L, 536158720L, 536158720L, 536158720L,
536158720L, 536158720L, 536158720L, 536158720L, 536158720L, 536158720L,
536158720L, 536158720L, 536158720L, 536158720L, 536158720L, 536158720L,
536158720L, 536158720L, 536158720L, 536158720L, 536158720L, 536158720L,
536158720L, 536158720L, 536158720L, 536158720L, 536158720L, 536158720L,
536158720L, 536158720L, 536158720L, 536158720L, 536158720L, 536158720L,
536158720L, 536158720L, 536158720L, 536158720L, 536158720L, 536158720L,
536158720L), inn = c("Top 1", "Top 1",
"Top 1", "Top 1", "Top 1", "Top 1", "Top 1", "Top 1", "Top 1",
"Top 1", "Top 1", "Top 1", "Top 1", "Top 1", "Top 1", "Top 2",
"Top 2", "Top 2", "Top 2", "Top 2", "Top 2", "Top 2", "Top 2",
"Top 2", "Top 2", "Top 2", "Top 2", "Top 2", "Top 3", "Top 3",
"Top 3", "Top 3", "Top 3", "Top 3", "Top 3", "Top 3", "Top 3",
"Top 3", "Top 3", "Top 3", "Top 4", "Top 4", "Top 4", "Top 4",
"Top 4", "Top 4", "Top 4", "Top 4", "Top 5", "Top 5", "Top 5",
"Top 5", "Top 5", "Top 5", "Top 5", "Top 5", "Top 5", "Top 5",
"Top 6", "Top 6", "Top 6", "Top 6", "Top 6", "Top 6", "Top 6",
"Top 6", "Top 6", "Top 6", "Top 6", "Top 6", "Top 6", "Top 6",
"Top 6", "Top 6", "Top 6", "Top 6", "Top 6", "Top 6", "Top 6",
"Top 6", "Top 7", "Top 7", "Top 7", "Top 7", "Top 7", "Top 7",
"Top 7", "Top 7", "Top 7", "Top 7", "Top 7", "Top 7", "Top 8",
"Top 8", "Top 8", "Top 8", "Top 8", "Top 8"
), batter = c("Player A", "Player A", "Player A", "Player A", "Player B", "Player B",
"Player B", "Player B", "Player B", "Player C", "Player C", "Player C", "Player C",
"Player C", "Player C", "Player D", "Player D", "Player D", "Player D", "Player D",
"Player D", "Player E", "Player E", "Player E", "Player F", "Player F", "Player F",
"Player F", "Player G", "Player G", "Player G", "Player H", "Player H", "Player I",
"Player I", "Player I", "Player I", "Player A", "Player A", "Player A", "Player B",
"Player B", "Player B", "Player B", "Player B", "Player C", "Player D", "Player D",
"Player E", "Player F", "Player G", "Player G", "Player G", "Player H", "Player H",
"Player H", "Player H", "Player I", "Player A", "Player A", "Player A", "Player B",
"Player C", "Player C", "Player C", "Player D", "Player D", "Player D", "Player D",
"Player E", "Player E", "BPlayer E", "Player E", "Player E", "Player E", "Player F", "Player F",
"Player F", "Player F", "Player F", "Player G", "Player G", "Player H", "Player I",
"Player I", "Player I", "Player I", "Player I", "Player I", "Player A", "Player A", "Player A",
"Player B", "Player B", "Player C", "Player C", "Player C", "Player C"), pitcher = c("Player 1", "Player 1", "Player 1", "Player 1", "Player 1",
"Player 1", "Player 1", "Player 1", "Player 1", "Player 1", "Player 1", "Player 1",
"Player 1", "Player 1", "Player 1", "Player 1", "Player 1", "Player 1", "Player 1",
"Player 1", "Player 1", "Player 1", "Player 1", "Player 1", "Player 1", "Player 1",
"Player 1", "Player 1", "Player 1", "Player 1", "Player 1", "Player 1", "Player 1",
"Player 1", "Player 1", "Player 1", "Player 1", "Player 1", "Player 1", "Player 1",
"Player 1", "Player 1", "Player 1", "Player 1", "Player 1", "Player 1", "Player 1",
"Player 1", "Player 1", "Player 1", "Player 1", "Player 1", "Player 1", "Player 1",
"Player 1", "Player 1", "Player 1", "Player 1", "Player 1", "Player 1", "Player 1",
"Player 1", "Player 1", "Player 1", "Player 1", "Player 1", "Player 1", "Player 1",
"Player 1", "Player 1", "Player 1", "Player 1", "Player 1", "Player 1", "Player 1",
"Player 1", "Player 1", "Player 1", "Player 1", "Player 1", "Player 1", "Player 1",
"Player 2", "Player 2", "Player 2", "Player 2", "Player 2", "Player 2",
"Player 2", "Player 2", "Player 2", "Player 2", "Player 2", "Player 2",
"Player 2", "Player 2", "Player 2", "Player 2"
), bb = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)), row.names = c(NA,
100L), class = "data.frame")
Where Players A-I take at bats (groups of pitches) against Player 1 and 2... If Player A Walked or got Hit by the pitch in their at bat it is denoted in the bb column on the pitch where that event occurred. What I want to do is have that bb column show a 1 for the entire at bat if they were walked or hit by the pitch instead of just on the row where the event happened.
I am not familiar with baseball rules, but I tried to get a result that matches your description. Note that your data has 100 rows, but the 99th and 100th rows are all "NA", so I only use rows 1 to 98. Here is my attempt:
# Flag every pitch of an at bat (one batter within one inning) with 1 when
# any pitch of that at bat is marked bb == 1 and its result matches "Walk"
# or "Hit"; all other at bats get 0.
dat[1:98, ] %>%
  group_by(batter, inn) %>%
  # The whole test sits inside any(), so it yields a single value per group
  # that mutate() recycles to every row of the at bat. "Walk|Hit" is an
  # alternation; the bracketed form "[Walk|Hit]" is a character class that
  # matches any one of those letters (and therefore also matches "Ball").
  mutate(
    wanted_bb = as.numeric(
      any(bb == 1 & str_detect(pitchResult, "Walk|Hit"))
    )
  ) %>%
  ungroup() %>%
  select(-bb) %>%
  as.data.frame()
The result:
count pitchResult gameId inn batter pitcher wanted_bb
1 0-0 Ball 536158720 Top 1 Player A Player 1 0
2 1-0 Ball 536158720 Top 1 Player A Player 1 0
3 2-0 Foul 536158720 Top 1 Player A Player 1 0
4 2-1 Ground Out 536158720 Top 1 Player A Player 1 0
5 0-0 Foul 536158720 Top 1 Player B Player 1 0
6 0-1 Strike Looking 536158720 Top 1 Player B Player 1 0
7 0-2 Ball 536158720 Top 1 Player B Player 1 0
8 1-2 Foul 536158720 Top 1 Player B Player 1 0
9 1-2 Ground Out 536158720 Top 1 Player B Player 1 0
10 0-0 Strike Looking 536158720 Top 1 Player C Player 1 0
11 0-1 Ball 536158720 Top 1 Player C Player 1 0
12 1-1 Foul 536158720 Top 1 Player C Player 1 0
13 1-2 Ball 536158720 Top 1 Player C Player 1 0
14 2-2 Foul 536158720 Top 1 Player C Player 1 0
15 2-2 Ground Out 536158720 Top 1 Player C Player 1 0
16 0-0 Ball 536158720 Top 2 Player D Player 1 0
17 1-0 Strike Looking 536158720 Top 2 Player D Player 1 0
18 1-1 Ball 536158720 Top 2 Player D Player 1 0
19 2-1 Ball 536158720 Top 2 Player D Player 1 0
20 3-1 Strike Swinging 536158720 Top 2 Player D Player 1 0
21 3-2 Single on a Fly Ball 536158720 Top 2 Player D Player 1 0
22 0-0 Ball 536158720 Top 2 Player E Player 1 0
23 1-0 Strike Swinging 536158720 Top 2 Player E Player 1 0
24 1-1 Double Play 536158720 Top 2 Player E Player 1 0
25 0-0 Strike Looking 536158720 Top 2 Player F Player 1 0
26 0-1 Ball 536158720 Top 2 Player F Player 1 0
27 1-1 Strike Looking 536158720 Top 2 Player F Player 1 0
28 1-2 Ground Out 536158720 Top 2 Player F Player 1 0
29 0-0 Ball 536158720 Top 3 Player G Player 1 0
30 1-0 Strike Swinging 536158720 Top 3 Player G Player 1 0
31 1-1 Ground Out 536158720 Top 3 Player G Player 1 0
32 0-0 Foul 536158720 Top 3 Player H Player 1 0
33 0-1 Single on a Fly Ball 536158720 Top 3 Player H Player 1 0
34 0-0 Ball 536158720 Top 3 Player I Player 1 0
35 1-0 Strike Swinging 536158720 Top 3 Player I Player 1 0
36 1-1 Strike Swinging 536158720 Top 3 Player I Player 1 0
37 1-2 Strikeout (Swinging) 536158720 Top 3 Player I Player 1 0
38 0-0 Strike Looking 536158720 Top 3 Player A Player 1 0
39 0-1 Foul 536158720 Top 3 Player A Player 1 0
40 0-2 Strikeout (Swinging) 536158720 Top 3 Player A Player 1 0
41 0-0 Strike Looking 536158720 Top 4 Player B Player 1 0
42 0-1 Strike Looking 536158720 Top 4 Player B Player 1 0
43 0-2 Ball 536158720 Top 4 Player B Player 1 0
44 1-2 Foul 536158720 Top 4 Player B Player 1 0
45 1-2 Fly Out 536158720 Top 4 Player B Player 1 0
46 0-0 Fly Out 536158720 Top 4 Player C Player 1 0
47 0-0 Strike Looking 536158720 Top 4 Player D Player 1 0
48 0-1 Fly Out 536158720 Top 4 Player D Player 1 0
49 0-0 Double on a Fly Ball 536158720 Top 5 Player E Player 1 0
50 0-0 Hit By Pitch 536158720 Top 5 Player F Player 1 1
51 0-0 Ball 536158720 Top 5 Player G Player 1 0
52 1-0 Ball 536158720 Top 5 Player G Player 1 0
53 2-0 Fly Out 536158720 Top 5 Player G Player 1 0
54 0-0 Ball 536158720 Top 5 Player H Player 1 1
55 1-0 Ball 536158720 Top 5 Player H Player 1 1
56 2-0 Ball 536158720 Top 5 Player H Player 1 1
57 3-0 Walk 536158720 Top 5 Player H Player 1 1
58 0-0 Double Play 536158720 Top 5 Player I Player 1 0
59 0-0 Ball 536158720 Top 6 Player A Player 1 0
60 1-0 Strike Looking 536158720 Top 6 Player A Player 1 0
61 1-1 Single on a Ground Ball 536158720 Top 6 Player A Player 1 0
62 0-0 Fly Out 536158720 Top 6 Player B Player 1 0
63 0-0 Ball 536158720 Top 6 Player C Player 1 0
64 1-0 Ball 536158720 Top 6 Player C Player 1 0
65 2-0 Fly Out 536158720 Top 6 Player C Player 1 0
66 0-0 Strike Looking 536158720 Top 6 Player D Player 1 0
67 0-1 Foul 536158720 Top 6 Player D Player 1 0
68 0-2 Foul 536158720 Top 6 Player D Player 1 0
69 0-2 Single on a Ground Ball 536158720 Top 6 Player D Player 1 0
70 0-0 Ball 536158720 Top 6 Player E Player 1 0
71 1-0 Strike Looking 536158720 Top 6 Player E Player 1 0
72 1-1 Ball 536158720 Top 6 BPlayer E Player 1 0
73 2-1 Foul 536158720 Top 6 Player E Player 1 0
74 2-2 Foul 536158720 Top 6 Player E Player 1 0
75 2-2 Home Run on a 402.65 ft Fly Ball 536158720 Top 6 Player E Player 1 0
76 0-0 Ball 536158720 Top 6 Player F Player 1 0
77 1-0 Ball 536158720 Top 6 Player F Player 1 0
78 2-0 Strike Swinging 536158720 Top 6 Player F Player 1 0
79 2-1 Foul 536158720 Top 6 Player F Player 1 0
80 2-2 Fly Out 536158720 Top 6 Player F Player 1 0
81 0-0 Strike Swinging 536158720 Top 7 Player G Player 1 0
82 0-1 Line Out 536158720 Top 7 Player G Player 1 0
83 0-0 Fly Out 536158720 Top 7 Player H Player 2 0
84 0-0 Strike Looking 536158720 Top 7 Player I Player 2 0
85 0-1 Foul 536158720 Top 7 Player I Player 2 0
86 0-2 Foul 536158720 Top 7 Player I Player 2 0
87 0-2 Ball 536158720 Top 7 Player I Player 2 0
88 1-2 Ball 536158720 Top 7 Player I Player 2 0
89 2-2 Single on a Ground Ball 536158720 Top 7 Player I Player 2 0
90 0-0 Ball 536158720 Top 7 Player A Player 2 0
91 1-0 Strike Looking 536158720 Top 7 Player A Player 2 0
92 1-1 Fielder's Choice 536158720 Top 7 Player A Player 2 0
93 0-0 Ball 536158720 Top 8 Player B Player 2 0
94 1-0 Ground Out 536158720 Top 8 Player B Player 2 0
95 0-0 Foul 536158720 Top 8 Player C Player 2 0
96 0-1 Ball 536158720 Top 8 Player C Player 2 0
97 1-1 Strike Swinging 536158720 Top 8 Player C Player 2 0
98 1-2 Single on a Line Drive 536158720 Top 8 Player C Player 2 0
An approach similar to Abdur Rohman's, but with a slightly different interpretation of your description:
library(dplyr)
library(stringr)
# Number consecutive runs of the same batter (a new run starts whenever the
# batter differs from the previous row), then mark every row of a run with 1
# if any pitchResult in that run matches "Walk" or "Hit", otherwise 0.
df[1:98, ] %>%
  mutate(grp = cumsum(coalesce(batter != lag(batter), FALSE))) %>%
  group_by(batter, grp) %>%
  mutate(new_bb = as.integer(any(str_detect(pitchResult, "Walk|Hit")))) %>%
  ungroup() %>%
  # grp was only a helper for grouping, so drop it from the result.
  select(-grp)
This returns
# A tibble: 98 x 8
count pitchResult gameId inn batter pitcher bb new_bb
<chr> <chr> <int> <chr> <chr> <chr> <dbl> <int>
1 0-0 Ball 536158720 Top 1 Player A Player 1 0 0
2 1-0 Ball 536158720 Top 1 Player A Player 1 0 0
3 2-0 Foul 536158720 Top 1 Player A Player 1 0 0
4 2-1 Ground Out 536158720 Top 1 Player A Player 1 0 0
5 0-0 Foul 536158720 Top 1 Player B Player 1 0 0
6 0-1 Strike Looking 536158720 Top 1 Player B Player 1 0 0
7 0-2 Ball 536158720 Top 1 Player B Player 1 0 0
8 1-2 Foul 536158720 Top 1 Player B Player 1 0 0
9 1-2 Ground Out 536158720 Top 1 Player B Player 1 0 0
10 0-0 Strike Looking 536158720 Top 1 Player C Player 1 0 0
...
48 0-1 Fly Out 536158720 Top 4 Player D Player 1 0 0
49 0-0 Double on a Fly Ball 536158720 Top 5 Player E Player 1 0 0
50 0-0 Hit By Pitch 536158720 Top 5 Player F Player 1 1 1
51 0-0 Ball 536158720 Top 5 Player G Player 1 0 0
52 1-0 Ball 536158720 Top 5 Player G Player 1 0 0
53 2-0 Fly Out 536158720 Top 5 Player G Player 1 0 0
54 0-0 Ball 536158720 Top 5 Player H Player 1 0 1
55 1-0 Ball 536158720 Top 5 Player H Player 1 0 1
56 2-0 Ball 536158720 Top 5 Player H Player 1 0 1
57 3-0 Walk 536158720 Top 5 Player H Player 1 1 1
58 0-0 Double Play 536158720 Top 5 Player I Player 1 0 0
59 0-0 Ball 536158720 Top 6 Player A Player 1 0 0
...
95 0-0 Foul 536158720 Top 8 Player C Player 2 0 0
96 0-1 Ball 536158720 Top 8 Player C Player 2 0 0
97 1-1 Strike Swinging 536158720 Top 8 Player C Player 2 0 0
98 1-2 Single on a Line Drive 536158720 Top 8 Player C Player 2 0 0
I don't know whether the last batter of one inning (the inn column) can also be the first batter of the next inning. If that can happen, you need to include inn in the group_by statement.

R loop to iterate and find unique combination between each item

concept_id concept_name event
1: 443387 Malignant tumor of stomach comorb
2: 4193704 Type 2 diabetes mellitus without complication comorb
3: 4095320 Malignant tumor of body of stomach comorb
4: 201826 Type 2 diabetes mellitus comorb
5: 4174977 Retinopathy due to diabetes mellitus comorb
For the above data, I am trying to create a list of combinations for concept_ids. There are 5 concept ids so when we iterate each concept_id with another concept_id we get a list something like this.
nrow(comorb_event)
for (i in (1:nrow(comorb_event))) {
for (j in (1:nrow(comorb_event))){
print(paste(i,j))
}
}
[1] "1 1"
[1] "1 2"
[1] "1 3"
[1] "1 4"
[1] "1 5"
[1] "2 1"
[1] "2 2"
[1] "2 3"
[1] "2 4"
[1] "2 5"
[1] "3 1"
[1] "3 2"
[1] "3 3"
[1] "3 4"
[1] "3 5"
[1] "4 1"
[1] "4 2"
[1] "4 3"
[1] "4 4"
[1] "4 5"
[1] "5 1"
[1] "5 2"
[1] "5 3"
[1] "5 4"
[1] "5 5"
My output is not what I expect. Since item [1,1] pairs an item with itself, we can skip it, and since item [2,1] is already covered by [1,2], we can remove that too. The expected list would be something like this after removing the redundant combinations:
[1] "1 2"
[1] "1 3"
[1] "1 4"
[1] "1 5"
[1] "2 3"
[1] "2 4"
[1] "2 5"
[1] "3 4"
[1] "3 5"
[1] "4 5"
Sample data
structure(list(concept_id = c("443387", "4193704", "4095320",
"201826", "4174977"), concept_name = c("Malignant tumor of stomach",
"Type 2 diabetes mellitus without complication", "Malignant tumor of body of stomach",
"Type 2 diabetes mellitus", "Retinopathy due to diabetes mellitus"
), event = structure(c(1L, 1L, 1L, 1L, 1L), .Label = c("comorb",
"drug", "primary_dx"), class = "factor")), class = c("data.table",
"data.frame"), row.names = c(NA, -5L), .internal.selfref = <pointer: 0x5642431689a0>)
We need combn, which returns every unordered pair of row indices exactly once:
t(combn(seq_len(nrow(comorb_event)), 2))

In dplyr group_by() + summarise(sum) is not working

This is my code:
df <- structure(list(NOME = c("JOGADOR 1", "JOGADOR 1", "JOGADOR 6",
"JOGADOR 6", "JOGADOR 5", "JOGADOR 5", "JOGADOR 3", "JUGADOR 3",
"JOGADOR 9", "JOGADOR 9", "JOGADOR 7", "JOGADOR 7", "JOGADOR 8",
"JOGADOR 8", "JOGADOR 10", "JOGADOR 10", "JOGADOR 4", "JOGADOR 4",
"JOGADOR 2", "JOGADOR 2", "JOGADOR 12", "JOGADOR 11", "JOGADOR 13"
), TOTAL_MINUTES = c(48.15, 43, 48.15, 51.9333333333333, 48.15,
51.9333333333333, 48.15, 51.9333333333333, 48.15, 25, 48.15,
51.9333333333333, 48.15, 29, 48.15, 42, 48.15, 51.9333333333333,
48.15, 51.9333333333333, 17, 26, 9), TOTAL.DISTANCE = c(5264L,
3999L, 5242L, 5589L, 5684L, 5966L, 4833L, 5012L, 5013L, 2653L,
5452L, 5691L, 5041L, 3775L, 5266L, 4321L, 4795L, 4924L, 5209L,
5242L, 2085L, 2703L, 1282L)), row.names = c(NA, -23L), class = c("tbl_df",
"tbl", "data.frame"))
It's a simple task, but it's not working:
df %>%group_by(NOME) %>%
summarise(across(TOTAL_MINUTES:TOTAL.DISTANCE),sum())
It is just repeating the NOME column values. It's not summing and giving one line per "JOGADOR X".
Why? Any help?
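The across() call was closed before sum was passed to it, so sum() ended up as a separate argument to summarise(). Also, if we are not providing a lambda expression, we pass the function itself (sum) rather than a call (sum()).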
library(dplyr)
df %>%
group_by(NOME) %>%
summarise(across(TOTAL_MINUTES:TOTAL.DISTANCE, sum,
na.rm = TRUE), .groups = 'drop')
-output
# A tibble: 14 × 3
NOME TOTAL_MINUTES TOTAL.DISTANCE
<chr> <dbl> <int>
1 JOGADOR 1 91.2 9263
2 JOGADOR 10 90.2 9587
3 JOGADOR 11 26 2703
4 JOGADOR 12 17 2085
5 JOGADOR 13 9 1282
6 JOGADOR 2 100. 10451
7 JOGADOR 3 48.2 4833
8 JOGADOR 4 100. 9719
9 JOGADOR 5 100. 11650
10 JOGADOR 6 100. 10831
11 JOGADOR 7 100. 11143
12 JOGADOR 8 77.2 8816
13 JOGADOR 9 73.2 7666
14 JUGADOR 3 51.9 5012
Or using lambda expression
df %>%
group_by(NOME) %>%
summarise(across(TOTAL_MINUTES:TOTAL.DISTANCE, ~sum(.x,
na.rm = TRUE)), .groups = 'drop')

Renaming labels of a factor in R

I have census data of Male and Female populations organized by age group:
library(tidyverse)
url <- "https://www2.census.gov/programs-surveys/popest/datasets/2010-2018/counties/asrh/cc-est2018-alldata-54.csv"
if (!file.exists("./datafiles/cc-est2018-alldata-54.csv"))
download.file(url, destfile = "./datafiles/cc-est2018-alldata-54.csv", mode = "wb")
popSample <- read.csv("./datafiles/cc-est2018-alldata-54.csv") %>%
filter(AGEGRP != 0 & YEAR == 1) %>%
select("STNAME", "CTYNAME", "AGEGRP", "TOT_POP", "TOT_MALE", "TOT_FEMALE")
popSample$AGEGRP <- as.factor(popSample$AGEGRP)
I then plot the Male and Female population relationships, faceted by age group (1-18, which is currently treated as an int):
g <- ggplot(popSample, aes(x=TOT_MALE, y=TOT_FEMALE)) +
geom_point(alpha = 0.5, colour="darkblue") +
scale_x_log10() +
scale_y_log10() +
facet_wrap(~AGEGRP) +
stat_smooth(method = "lm", col = "darkred", size=.75) +
labs(title = "F vs. M Population across all Age Groups", x = "Total Male (log10)", y = "Total Female (log10)") +
theme_light()
g
Which results in this plot: https://share.getcloudapp.com/v1ur6O4e
The problem: I am trying to convert the column AGEGRP from ‘int’ to ‘factor’, and change the factors labels from “1”, “2”, “3”, … “18” to "AgeGroup1", "AgeGroup2", "AgeGroup3", … "AgeGroup18"
When I try this code, my AGEGRP column's observation values are all replaced with NAs:popSample$AGEGRP <- factor(popSample$AGEGRP, levels = c("0 to 4", "5 to 9", "10 to 14", "15 to 19", "20 to 24", "25 to 29", "30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59", "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85+"))
https://share.getcloudapp.com/qGuo1O4y
Thank you for your help,
popSample$AGEGRP <- factor( popSample$AGEGRP, levels = c("0 to 4", "5 to 9", "10 to 14", "15 to 19", "20 to 24", "25 to 29", "30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59", "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85+"))
Need to add all levels though.
Alternatively
levels(popSample$AGEGRP) <- c("0 to 4", "5 to 9", "10 to 14", "15 to 19", "20 to 24", "25 to 29", "30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59", "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85+")
should work as well.
Read in the csv again:
library(tidyverse)
url <- "https://www2.census.gov/programs-surveys/popest/datasets/2010-2018/counties/asrh/cc-est2018-alldata-54.csv"
popSample <- read.csv(url) %>%
filter(AGEGRP != 0 & YEAR == 1) %>%
select("STNAME", "CTYNAME", "AGEGRP", "TOT_POP", "TOT_MALE", "TOT_FEMALE")
If you just want to add a prefix "AgeGroup" to your facet labels, you do:
ggplot(popSample, aes(x=TOT_MALE, y=TOT_FEMALE)) +
geom_point(alpha = 0.5, colour="darkblue") +
scale_x_log10() +
scale_y_log10() +
facet_wrap(~AGEGRP,labeller=labeller(AGEGRP = function(i)paste0("AgeGroup",i))) +
stat_smooth(method = "lm", col = "darkred", size=.75) +
labs(title = "F vs. M Population across all Age Groups",
x = "Total Male (log10)", y = "Total Female (log10)") +
theme_light()
If there is a need for new factor labels, then you need to re-create the factor (like @Annet's answer below):
# Replacement labels for AGEGRP, 18 in total; the label at position i is
# the one used for group code i by the lookup below.
lvls = c("0 to 4", "5 to 9", "10 to 14", "15 to 19",
"20 to 24", "25 to 29", "30 to 34", "35 to 39",
"40 to 44", "45 to 49", "50 to 54", "55 to 59",
"60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85+")
# AGEGRP was converted to a factor earlier; subsetting lvls by a factor uses
# the factor's underlying integer codes, so each row picks up its label.
# If you read the csv again (AGEGRP still an integer), that earlier
# factorization step can be skipped and this line works the same way.
popSample$AGEGRP = factor(lvls[popSample$AGEGRP],levels=lvls)
Then plot:
ggplot(popSample, aes(x=TOT_MALE, y=TOT_FEMALE)) +
geom_point(alpha = 0.5, colour="darkblue") +
scale_x_log10() +
scale_y_log10() +
facet_wrap(~AGEGRP) +
stat_smooth(method = "lm", col = "darkred", size=.75) +
labs(title = "F vs. M Population across all Age Groups",
x = "Total Male (log10)", y = "Total Female (log10)") +
theme_light()
To change all the factor labels with one function, you can use forcats::fct_relabel (forcats ships as part of the tidyverse, which you've already got loaded). The changed factor labels will carry over to the plot facets and the order stays the same.
First few entries:
# before relabelling
popSample$AGEGRP[1:4]
#> [1] 1 2 3 4
#> Levels: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
# after relabelling
forcats::fct_relabel(popSample$AGEGRP, ~paste0("AgeGroup", .))[1:4]
#> [1] AgeGroup1 AgeGroup2 AgeGroup3 AgeGroup4
#> 18 Levels: AgeGroup1 AgeGroup2 AgeGroup3 AgeGroup4 AgeGroup5 ... AgeGroup18
Or with base R, reassign the levels:
levels(popSample$AGEGRP) <- paste0("AgeGroup", levels(popSample$AGEGRP))
popSample$AGEGRP[1:4]
#> [1] AgeGroup1 AgeGroup2 AgeGroup3 AgeGroup4
#> 18 Levels: AgeGroup1 AgeGroup2 AgeGroup3 AgeGroup4 AgeGroup5 ... AgeGroup18

selecting a subset of data based on another column

I have a dataset which looks something like this:
Area Num
[1,] "Area 1" "99"
[2,] "Area 3" "85"
[3,] "Area 1" "60"
[4,] "Area 2" "90"
[5,] "Area 1" "40"
[6,] "Area 3" NA
[7,] "Area 4" "10"
...
code:
structure(c("Area 1", "Area 3", "Area 1", "Area 2", "Area 1",
"Area 3", "Area 4", "99", "85", "60", "90", "40", NA, "10"), .Dim = c(7L,
2L), .Dimnames = list(NULL, c("Area", "Num")))
I need to do some calculation on values in Num for each Area, for example calculating the sum of each Area, or the summary of each Area.
I'm thinking of using a nested for loop to achieve this, but I'm not sure how to.
You can do this using aggregate, but the dplyr package makes it very easy to work with such problems. There are plenty of duplicates of this question, though.
library(dplyr)
# Example data: a 7 x 2 character matrix with columns Area and Num.
df <- structure(
  c("Area 1", "Area 3", "Area 1", "Area 2", "Area 1", "Area 3", "Area 4",
    "99", "85", "60", "90", "40", NA, "10"),
  .Dim = c(7L, 2L),
  .Dimnames = list(NULL, c("Area", "Num"))
)
# stringsAsFactors = FALSE keeps Num as character on R < 4.0 as well, so
# as.numeric() converts the digits themselves rather than factor codes.
df <- data.frame(df, stringsAsFactors = FALSE)
df$Num <- as.numeric(df$Num)
# One row per Area with the sum of its Num values; na.rm = TRUE skips the
# missing Num (TRUE is spelled out because T can be reassigned).
df2 <- df %>%
  group_by(Area) %>%
  summarise(totalNum = sum(Num, na.rm = TRUE))
df2
In order to apply the function to every level of the factor, we can resort to the by function:
# Example data: a 7 x 2 character matrix with columns Area and Num.
dt <- structure(
  c("Area 1", "Area 3", "Area 1", "Area 2", "Area 1", "Area 3", "Area 4",
    "99", "85", "60", "90", "40", NA, "10"),
  .Dim = c(7L, 2L),
  .Dimnames = list(NULL, c("Area", "Num"))
)
# stringsAsFactors = FALSE keeps Num as character on R < 4.0 as well, so
# as.numeric() converts the digits themselves rather than factor codes.
dt <- data.frame(dt, stringsAsFactors = FALSE)
dt$Num <- as.numeric(dt$Num)
# by() splits Num by Area and applies sum() to each piece. Extra arguments
# are forwarded to the function, so na.rm = TRUE makes sum() skip the
# missing Num in "Area 3" instead of returning NA for that area.
t <- by(dt$Num, dt$Area, sum, na.rm = TRUE)
t
Doing the same thing using data.table
library(data.table)
# Same per-Area sum with data.table: the j expression computes the sum and
# by = Area evaluates it once per group. na.rm = TRUE skips missing values
# (TRUE is spelled out because T can be reassigned).
dt <- data.table(df)
dt[, sum(as.numeric(Num), na.rm = TRUE), by = Area]
## Area V1
## 1: Area 1 199
## 2: Area 3 85
## 3: Area 2 90
## 4: Area 4 10

Resources