How to create a unique observation ID using hash functions? - r

I have received a data frame for analysis; each observation is a row, with 120 variables. Unfortunately, I have not received an observation ID variable that uniquely identifies each observation.
I was thinking maybe I could concatenate all columns to a string and hash this string to obtain a unique ID.
How can I do this without specifying all variables, as I would have to with paste()? Or is there another solution?
The data can contain NA
here is the sample dataset
structure(list(Class = structure(c(1L, 2L, 3L, 4L, 1L, 2L, 3L,
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L,
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L), levels = c("1st", "2nd",
"3rd", "Crew"), class = "factor"), Sex = structure(c(1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), levels = c("Male",
"Female"), class = "factor"), Age = structure(c(1L, NA, 1L, NA,
1L, NA, 1L, 1L, 2L, 2L, NA, 2L, 2L, 2L, 2L, NA, 1L, 1L, 1L, NA,
NA, 1L, 1L, 1L, NA, 2L, 2L, 2L, 2L, 2L, 2L, NA), levels = c("Child",
"Adult"), class = "factor"), Survived = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), levels = c("No",
"Yes"), class = "factor"), Freq = c(0, 0, 35, 0, 0, 0, 17, 0,
118, 154, 387, 670, 4, 13, 89, 3, 5, 11, 13, 0, 1, 13, 14, 0,
57, 14, 75, 192, 140, 80, 76, 20)), row.names = c(NA, -32L), class = "data.frame")

Maybe you want to use the unique_identifier function from the udpipe package which does:
Create a unique identifier for each combination of fields in a data
frame. This unique identifier is unique for each combination of the
elements of the fields. The generated identifier is like a primary key
or a secondary key on a table. This is just a small wrapper around
frank
Here is a reproducible example:
df <- structure(list(Class = structure(c(1L, 2L, 3L, 4L, 1L, 2L, 3L,
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L,
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L), levels = c("1st", "2nd",
"3rd", "Crew"), class = "factor"), Sex = structure(c(1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), levels = c("Male",
"Female"), class = "factor"), Age = structure(c(1L, NA, 1L, NA,
1L, NA, 1L, 1L, 2L, 2L, NA, 2L, 2L, 2L, 2L, NA, 1L, 1L, 1L, NA,
NA, 1L, 1L, 1L, NA, 2L, 2L, 2L, 2L, 2L, 2L, NA), levels = c("Child",
"Adult"), class = "factor"), Survived = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), levels = c("No",
"Yes"), class = "factor"), Freq = c(0, 0, 35, 0, 0, 0, 17, 0,
118, 154, 387, 670, 4, 13, 89, 3, 5, 11, 13, 0, 1, 13, 14, 0,
57, 14, 75, 192, 140, 80, 76, 20)), row.names = c(NA, -32L), class = "data.frame")
library(udpipe)
#> Warning: package 'udpipe' was built under R version 4.1.2
df$ID <- unique_identifier(df, fields = colnames(df))
df
#> Class Sex Age Survived Freq ID
#> 1 1st Male Child No 0 1
#> 2 2nd Male <NA> No 0 12
#> 3 3rd Male Child No 35 17
#> 4 Crew Male <NA> No 0 27
#> 5 1st Female Child No 0 5
#> 6 2nd Female <NA> No 0 16
#> 7 3rd Female Child No 17 21
#> 8 Crew Female Child No 0 29
#> 9 1st Male Adult No 118 3
#> 10 2nd Male Adult No 154 10
#> 11 3rd Male <NA> No 387 20
#> 12 Crew Male Adult No 670 25
#> 13 1st Female Adult No 4 6
#> 14 2nd Female Adult No 13 14
#> 15 3rd Female Adult No 89 23
#> 16 Crew Female <NA> No 3 31
#> 17 1st Male Child Yes 5 2
#> 18 2nd Male Child Yes 11 9
#> 19 3rd Male Child Yes 13 18
#> 20 Crew Male <NA> Yes 0 28
#> 21 1st Female <NA> Yes 1 8
#> 22 2nd Female Child Yes 13 13
#> 23 3rd Female Child Yes 14 22
#> 24 Crew Female Child Yes 0 30
#> 25 1st Male <NA> Yes 57 4
#> 26 2nd Male Adult Yes 14 11
#> 27 3rd Male Adult Yes 75 19
#> 28 Crew Male Adult Yes 192 26
#> 29 1st Female Adult Yes 140 7
#> 30 2nd Female Adult Yes 80 15
#> 31 3rd Female Adult Yes 76 24
#> 32 Crew Female <NA> Yes 20 32
Created on 2022-07-24 by the reprex package (v2.0.1)

Another option is to use unclass on factors (i.e., after pasting all columns together using Reduce), which will convert the factors to their numbers.
# Reduce(paste, df) pastes the columns together into one string per row;
# as.factor() turns those strings into a factor, and c(unclass(...)) keeps only
# its integer codes. Rows with identical values in every column get the same ID.
df$ID <- c(unclass(as.factor(Reduce(paste, df))))
Output
Class Sex Age Survived Freq ID
1 1st Male Child No 0 6
2 2nd Male <NA> No 0 16
3 3rd Male Child No 35 22
4 Crew Male <NA> No 0 31
5 1st Female Child No 0 3
6 2nd Female <NA> No 0 12
7 3rd Female Child No 17 19
8 Crew Female Child No 0 25
9 1st Male Adult No 118 5
10 2nd Male Adult No 154 13
11 3rd Male <NA> No 387 24
12 Crew Male Adult No 670 29
13 1st Female Adult No 4 1
14 2nd Female Adult No 13 9
15 3rd Female Adult No 89 17
16 Crew Female <NA> No 3 27
17 1st Male Child Yes 5 7
18 2nd Male Child Yes 11 15
19 3rd Male Child Yes 13 23
20 Crew Male <NA> Yes 0 32
21 1st Female <NA> Yes 1 4
22 2nd Female Child Yes 13 11
23 3rd Female Child Yes 14 20
24 Crew Female Child Yes 0 26
25 1st Male <NA> Yes 57 8
26 2nd Male Adult Yes 14 14
27 3rd Male Adult Yes 75 21
28 Crew Male Adult Yes 192 30
29 1st Female Adult Yes 140 2
30 2nd Female Adult Yes 80 10
31 3rd Female Adult Yes 76 18
32 Crew Female <NA> Yes 20 28

Related

How to loop over a grouped dataframe using dplyr::count() and purrr::map()

Background
I want to loop over a grouped dataframe of factor variables to count the
occurrences of each value within a variable using count function from dplyr,
and I think that the purrr::map function would be the most suitable.
However, I cannot get this to work.
I tried to use this post for my needs, but this did not work either.
I also tried to hack together a function based on
this post, but could not get this to work with the grouping variable.
Question
Is it possible to loop over a grouped dataframe in the way that I want? If so,
how?
Thanks in advance for your consideration.
Reproducible example
library(tidyverse)
vars_df <-
structure(list(c = structure(c(2L, 3L, 3L, 2L, 3L, 3L, 2L, 2L,
1L, 2L, 2L, 2L, 3L, 1L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
2L, 3L, 1L, 2L, 1L, 3L, 3L, 2L, 1L, 2L, 2L, 2L, 3L, 3L, 2L, 1L,
1L, 2L, 3L, 3L, 2L, 2L, 3L, 3L, 3L, 2L), .Label = c("1", "2",
"3"), class = "factor"), pastpsyc = structure(c(2L, 1L, NA, 2L,
1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L,
1L, 2L, 2L, 1L, 2L, NA, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L), .Label = c("0",
"1"), class = "factor"), pastmed = structure(c(2L, 1L, NA, 2L,
1L, 1L, 1L, 1L, NA, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L,
2L, 2L, 1L, 1L, 1L, NA, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L), .Label = c("0",
"1"), class = "factor"), hxsuicide = structure(c(2L, 1L, NA,
2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 2L, 2L, 1L, 1L, 2L, NA, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L), .Label = c("0",
"1"), class = "factor"), hxdsh = structure(c(2L, 1L, NA, 1L,
2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L,
2L, 2L, 2L, 1L, 2L, NA, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("0",
"1"), class = "factor"), hxtrauma = structure(c(2L, 1L, NA, 2L,
1L, 1L, 1L, 1L, NA, 1L, 1L, NA, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L,
2L, 1L, 2L, 1L, 1L, NA, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L,
1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("0",
"1"), class = "factor")), row.names = c(NA, -50L), class = c("tbl_df",
"tbl", "data.frame"))
map_dfr(setNames(c('1', '2', '3'),
c('1', '2', '3')), ~
vars_df %>%
group_by(c) %>%
summarise(across(everything(), function(x)
sum(x == .x, na.rm = TRUE)), .groups = 'drop'), .id = 'var')
#> # A tibble: 9 x 7
#> var c pastpsyc pastmed hxsuicide hxdsh hxtrauma
#> <chr> <fct> <int> <int> <int> <int> <int>
#> 1 1 1 3 2 2 5 1
#> 2 1 2 16 9 18 16 10
#> 3 1 3 12 3 8 11 9
#> 4 2 1 0 0 0 0 0
#> 5 2 2 0 0 0 0 0
#> 6 2 3 0 0 0 0 0
#> 7 3 1 0 0 0 0 0
#> 8 3 2 0 0 0 0 0
#> 9 3 3 0 0 0 0 0
vars_df %>%
group_by(c) %>%
count(pastpsyc)
#> # A tibble: 7 x 3
#> # Groups: c [3]
#> c pastpsyc n
#> <fct> <fct> <int>
#> 1 1 0 4
#> 2 1 1 3
#> 3 2 0 8
#> 4 2 1 16
#> 5 3 0 5
#> 6 3 1 12
#> 7 3 <NA> 2
vars_df %>%
group_by(c) %>%
map(~ count(.))
#> Error in UseMethod("count"): no applicable method for 'count' applied to an object of class "factor"
# Count the rows of `vars_df` for each value of one column.
#
# `mygroup` is a bare column name (or an unquoted symbol such as `!!var`).
# It is forwarded to group_by() with the embrace operator `{{ }}`.
# Returns an ungrouped tibble with the column and its count `n`.
.get_count <-
  function(mygroup) {
    grouped <- group_by(vars_df, {{ mygroup }})
    tallied <- count(grouped)
    ungroup(tallied)
  }
vars <-
vars_df %>%
colnames()
vars %>%
syms() %>%
map(function(var) .get_count(!!var))
#> [[1]]
#> # A tibble: 3 x 2
#> c n
#> <fct> <int>
#> 1 1 7
#> 2 2 24
#> 3 3 19
#>
#> [[2]]
#> # A tibble: 3 x 2
#> pastpsyc n
#> <fct> <int>
#> 1 0 17
#> 2 1 31
#> 3 <NA> 2
#>
#> [[3]]
#> # A tibble: 3 x 2
#> pastmed n
#> <fct> <int>
#> 1 0 33
#> 2 1 14
#> 3 <NA> 3
#>
#> [[4]]
#> # A tibble: 3 x 2
#> hxsuicide n
#> <fct> <int>
#> 1 0 20
#> 2 1 28
#> 3 <NA> 2
#>
#> [[5]]
#> # A tibble: 3 x 2
#> hxdsh n
#> <fct> <int>
#> 1 0 16
#> 2 1 32
#> 3 <NA> 2
#>
#> [[6]]
#> # A tibble: 3 x 2
#> hxtrauma n
#> <fct> <int>
#> 1 0 26
#> 2 1 20
#> 3 <NA> 4
vars %>%
syms() %>%
group_by(c) %>%
map(function(var) .get_count(!!var))
#> Error in UseMethod("group_by"): no applicable method for 'group_by' applied to an object of class "list"
# Created on 2021-05-26 by the reprex package (v2.0.0)
You can use map as -
library(tidyverse)
vars %>% map(~vars_df %>% count(c, .data[[.x]]))
#[[1]]
# A tibble: 3 x 2
# c n
# <fct> <int>
#1 1 7
#2 2 24
#3 3 19
#[[2]]
# A tibble: 7 x 3
# c pastpsyc n
# <fct> <fct> <int>
#1 1 0 4
#2 1 1 3
#3 2 0 8
#4 2 1 16
#5 3 0 5
#6 3 1 12
#7 3 NA 2
#...
#...
A different way to show the output in a long format -
vars_df %>% pivot_longer(cols = -c) %>% count(c, name, value)
# c name value n
# <fct> <chr> <fct> <int>
# 1 1 hxdsh 0 2
# 2 1 hxdsh 1 5
# 3 1 hxsuicide 0 5
# 4 1 hxsuicide 1 2
# 5 1 hxtrauma 0 5
# 6 1 hxtrauma 1 1
# 7 1 hxtrauma NA 1
# 8 1 pastmed 0 4
# 9 1 pastmed 1 2
#10 1 pastmed NA 1
# … with 28 more rows

How can I do this standard excel operation in R?

I want to identify who are default and voluntary members in an Insurance database. Default members are ones with a certain number of units depending on their age. Voluntary members are any members with more units than default members at that age.
I want to create a column in R that says either "Default" or "Voluntary"
I have a table of the number of units a default member has. For example:
Age Units
18 2
19 2
20 2
21 2
22 2
23 2
24 2
25 3
26 3
27 3
28 3
29 3
30 3
31 4
32 4
33 4
34 4
35 4
36 4
37 4
38 4
39 4
40 4
41 4
42 4
43 4
44 4
45 4
46 4
47 4
48 4
49 4
50 3
51 3
52 3
53 3
54 3
55 3
56 3
57 3
58 3
59 3
60 2
61 2
62 2
63 2
64 2
65 1
66 1
67 1
68 1
69 1
I would usually do this in excel by vlookup-ing the member's number of units and if it equals the default number of units from above table I would say they are default and if not non default.
This is how I would achieve in excel
if( MembersUnits = vlookup(memberage,defaultunitstable,2,0),"Default", "Voluntary")
I expect the output to be "Default" or "Voluntary"
Using the data you supplied as a lookup table, I created data of person age and the number of units they have, joined the threshold values from lookup and compared the values with ifelse:
library(dplyr)
lookup <- structure(list(Age = 18:69,
Units = c(2L, 2L, 2L, 2L, 2L, 2L,
2L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L)),
row.names = c(NA,
-52L), class = c("tbl_df", "tbl", "data.frame"))
dat <- tibble(Age = c(50, 50, 49, 32, 18), Units = c(3, 5, 5, 4, 3))
left_join(dat, rename(lookup, "Threshold" = "Units"), by = "Age") %>%
mutate(member = ifelse(Units == Threshold, "Default", "Voluntary"))
# A tibble: 5 x 4
Age Units Threshold member
<dbl> <dbl> <int> <chr>
1 50 3 3 Default
2 50 5 3 Voluntary
3 49 5 4 Voluntary
4 32 4 4 Default
5 18 3 2 Voluntary
# Install "prodlim" only when it is not yet available, then attach it with
# library(), which stops with an error if loading fails (require() would only
# return FALSE and let the later row.match() call fail obscurely).
if (!requireNamespace("prodlim", quietly = TRUE)) {
  install.packages("prodlim")
}
library(prodlim) # ensure installation and loading of package "prodlim"
ifelse(is.na(row.match(as.data.frame(dat), as.data.frame(lookup))),
"Voluntary",
"Default")
## [1] "Default" "Voluntary" "Voluntary" "Default" "Default" "Default"
## the function
## prodlim::row.match(as.data.frame(dat), as.data.frame(lookup))
## returns for each row in dat,
## the matching row number in lookup or
## NA if there is no match
##
## This resulting vector one can use to translate any non-NA to "Default" and
## any NA to "Voluntary" using the vectorized `ifelse`
Ah, I used the following example data, following @Paul:
library(dplyr) # library() errors if dplyr is missing; require() would only return FALSE
dat <- tibble(Age = c(50, 50, 49, 26, 32, 18), Units = c(3, 5, 5, 3, 4, 2))
lookup <- structure(list(Age = 18:69,
Units = c(2L, 2L, 2L, 2L, 2L, 2L,
2L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L)),
row.names = c(NA,
-52L), class = c("tbl_df", "tbl", "data.frame"))

R ggplot bar graph has extra lines at the base of columns

I have two main issues I could use some help getting resolved.
1.) There are odd lines at the base of my columns which I am not sure how to get rid of.
2.) I am running into overlap with the columns when I graph. (I think this has something to do with position_dodge(width = XXX), but I am not totally sure.)
Attached an image of an example plot, mainly because I am not sure how to describe what is happening at the base of the plot.
The following code is being used.
where_2 <- where %>%
group_by_("gender", "radio") %>%
summarise(count = n()) %>%
mutate(perc = (perc = (count / sum(count) * 100)))
gg <- ggplot(where_2, aes_string(x = names(where_2[1]), y = where_2$perc, fill = "radio"))
gg <- gg + geom_bar(aes(y = (..count..) / sum(..count..)))
gg <-gg + geom_bar(position = position_dodge(.5),stat = "identity", width = .75)
#gg <- gg + scale_y_continuous(labels = scales::percent)
gg <- gg + xlab(paste0(lab5[2, title]))
gg <- gg + scale_fill_discrete(labels = c("Yes", "No"))
print(gg)
I have been running into a wall for the past 4 days with this question; any help would be appreciated.
place gender Radio
1 Male No
1 Female Yes
1 Male No
1 Female Yes
1 Male Yes
1 Male Yes
1 Female Yes
1 Female Yes
1 Male Yes
1 Female No
1 Male Yes
1 Male Yes
1 Male No
1 Female No
1 Female Yes
1 Female Yes
1 Female No
1 Male Yes
1 Female No
1 Female Yes
1 Female No
1 Female Yes
1 Male No
1 Male No
1 Female No
1 Male No
1 Female No
1 Female No
1 Female No
1 Male Yes
1 Female No
1 Female No
1 Female Yes
1 Male No
1 Male Yes
1 Female No
2 Male Yes
2 Male Yes
2 Female No
2 Female No
2 Male Yes
2 Female No
2 Male No
2 Male Yes
2 Female No
2 Female No
2 Female No
2 Male No
2 Female No
2 Male No
2 Female Yes
2 Female Yes
2 Male Yes
2 Male No
2 Male Yes
3 Female No
3 Male Yes
3 Female No
3 Male No
3 Male Yes
3 Female No
3 Female Yes
3 Male No
3 Male Yes
3 Female Yes
3 Male No
3 Female No
3 Female Yes
3 Female No
3 Female Yes
3 Female No
3 Male Yes
3 Female No
3 Female No
4 Male Yes
4 Female No
4 Female Yes
4 Female Yes
4 Male Yes
4 Female No
4 Female No
4 Male No
4 Female No
4 Female No
4 Female No
4 Male Yes
4 Male Yes
4 Female Yes
4 Female No
4 Male Yes
4 Male Yes
4 Male Yes
4 Female No
4 Female No
4 Female No
Try this:
gg <- ggplot(where2,
aes(x = gender, y = perc, fill = Radio)) +
geom_col(position = "dodge", width = .75)
print(gg)
Explanation below:
You are right that the "feet" are indeed caused by geom_bar(aes(y = (..count..) / sum(..count..))). I'm not sure why you included it in the first place, but here's why it created the "feet":
Good chart
p <- ggplot(where2, aes(x = gender, y = perc, fill = Radio))
p + geom_col(position = position_dodge(0.5), width = 0.75)
Above is the chart you want to get (I assume). geom_col() is equivalent to geom_bar(stat = "identity") with less typing, so I used that instead.
Usually people set the same value in position_dodge() and width =, which would avoid the overlapped look. I've retained it for now to contrast with the "feet" below.
Notice also the values on the y-axis. They range from 0 to 60+.
Bad chart
p + geom_bar(aes(y = (..count..) / sum(..count..)))
Above is the chart of the "feet", now occupying the entire plot's height. Here, ..count.. returns the number of rows for each combination of gender & Radio, while sum(..count..) returns the total number of rows in the data frame. The data frame, where2, has 4 rows, one for each combination, so the y value associated with each bar is 0.25, and the stacked height of each gender's two bars is 0.5.
I consider this the bad chart, because the visualisation is useless. When you have already counted the number of rows in your dataset yourself (going from where to where2), it's not necessary for ggplot to do it again.
Good chart + bad chart = weird chart
p +
geom_col(position = position_dodge(0.5), width = 0.75) +
geom_bar(aes(y = (..count..) / sum(..count..)))
Above is the combined chart with both layers. Now the bad chart's bars are squeezed all the way to the bottom, since their combined height is only 0.5, while the good chart's bars stretch all the way to 60+.
data used:
> dput(where)
structure(list(place = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L), gender = structure(c(2L, 1L, 2L, 1L,
2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L,
2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L,
2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L,
1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L), .Label = c("Female",
"Male"), class = "factor"), Radio = structure(c(1L, 2L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L,
2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor")), .Names = c("place", "gender", "Radio"
), class = "data.frame", row.names = c(NA, -95L))
# Count the rows for each gender/Radio combination, then express each count as
# a percentage. summarise() drops the last grouping level (Radio), so
# sum(count) in mutate() is taken within each gender.
where2 <- where %>%
  group_by(gender, Radio) %>%
  summarise(count = n()) %>%
  mutate(perc = count / sum(count) * 100)
> where2
# A tibble: 4 x 4
# Groups: gender [2]
gender Radio count perc
<fctr> <fctr> <int> <dbl>
1 Female No 37 67.3
2 Female Yes 18 32.7
3 Male No 15 37.5
4 Male Yes 25 62.5

How to aggregate data in R by a column containing strings? [duplicate]

I have asked this question earlier and received a reply which was not in accordance with my wish. At the time I used stata to do the job. However as I routinely work with such data, I wish to use R to create what I wanted. I have a data set of daily hospital admission by age, sex and diagnoses. I wish to aggregate and reshape the data from long to wide. How could I achieve this objective? Sample data and required output are shown below. The column headers designate prefix of sex, age and diagnoses.
Thanks
Sample data
structure(list(diag = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L), .Label = c("card", "cere"), class = "factor"), sex = structure(c(1L,
1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L,
1L, 2L, 2L, 1L, 1L, 2L, 2L), .Label = c("Female", "Male"), class = "factor"),
age = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("35-64",
"65-74"), class = "factor"), admissions = c(1L, 1L, 0L, 0L,
6L, 6L, 6L, 1L, 4L, 0L, 0L, 0L, 4L, 6L, 5L, 2L, 2L, 4L, 1L,
0L, 6L, 5L, 6L, 4L), bdate = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L), .Label = c("1987-01-01", "1987-01-02",
"1987-01-03"), class = "factor")), .Names = c("diag", "sex",
"age", "admissions", "bdate"), row.names = c(NA, -24L), class = "data.frame")
Required output
structure(list(date = structure(1:3, .Label = c("01jan1987",
"02jan1987", "03jan1987"), class = "factor"), f3564card = c(1L,
4L, 2L), f6574card = c(1L, 0L, 4L), m3564card = c(0L, 0L, 1L),
m6574card = c(0L, 0L, 0L), f3564cere = c(6L, 4L, 6L), f6574cere = c(6L,
6L, 5L), m3564cere = c(6L, 5L, 6L), m6574cere = c(1L, 2L,
4L)), .Names = c("date", "f3564card", "f6574card", "m3564card",
"m6574card", "f3564cere", "f6574cere", "m3564cere", "m6574cere"
), class = "data.frame", row.names = c(NA, -3L))
Your data are already in a long format that can be used easily by "reshape2", like this:
library(reshape2) # dcast() is exported by reshape2, not by the older reshape package
# One row per bdate, one column per sex/age/diag combination, filled with admissions.
dcast(df, bdate ~ sex + age + diag, value.var = "admissions")
# bdate Female_35-64_card Female_35-64_cere Female_65-74_card Female_65-74_cere
# 1 1987-01-01 1 6 1 6
# 2 1987-01-02 4 4 0 6
# 3 1987-01-03 2 6 4 5
# Male_35-64_card Male_35-64_cere Male_65-74_card Male_65-74_cere
# 1 0 6 0 1
# 2 0 5 0 2
# 3 1 6 0 4
I don't see any aggregation in your sample output, but if aggregation is required, you can achieve this with the fun.aggregate argument of dcast.
# Read the sample CSV. `header` is spelled out in full (`head` only works through
# partial argument matching) and TRUE/FALSE replace the reassignable T/F.
df <- read.table("D:/Programacao/R/Stackoverflow/Nova pasta/sample.csv",
                 header = TRUE, dec = '.', sep = ',',
                 stringsAsFactors = FALSE)
head(df)
date sex cvd ACS age
1 01 Jul 91 female 0 0 35-64
2 01 Jul 91 male 0 0 35-64
3 01 Jul 91 female 0 0 35-64
4 01 Jul 91 male 1 1 35-64
5 01 Jul 91 female 0 0 65-74
6 02 Jul 91 male 0 0 65-74
Considering that cvd and ACS are not mutually exclusive to males and females respectively,
library(dplyr)
# Sum cvd and ACS for every date/sex/age combination.
# %.% was dplyr's original chaining operator; it was deprecated and later
# removed from dplyr, so %>% is used instead.
df %>%
  group_by(date, sex, age) %>%
  summarise(vcvd = sum(cvd),
            vacs = sum(ACS))
Source: local data frame [111 x 5]
Groups: date, sex
date sex age vcvd vacs
1 01 Jul 91 female 35-64 0 0
2 01 Jul 91 female 65-74 0 0
3 01 Jul 91 male 35-64 1 1
4 02 Aug 91 female 35-64 0 0
5 02 Jul 91 female 65-74 1 0
6 02 Jul 91 male 65-74 0 0
7 03 Aug 91 female 65-74 0 0
8 03 Jul 91 female 35-64 0 0
9 04 Jul 91 male 35-64 1 0
10 04 Jul 91 male 65-74 0 0
.. ... ... ... ... ...

Aggregate and reshape from long to wide

I have asked this question earlier and received a reply which was not in accordance with my wish. At the time I used stata to do the job. However as I routinely work with such data, I wish to use R to create what I wanted. I have a data set of daily hospital admission by age, sex and diagnoses. I wish to aggregate and reshape the data from long to wide. How could I achieve this objective? Sample data and required output are shown below. The column headers designate prefix of sex, age and diagnoses.
Thanks
Sample data
structure(list(diag = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L), .Label = c("card", "cere"), class = "factor"), sex = structure(c(1L,
1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L,
1L, 2L, 2L, 1L, 1L, 2L, 2L), .Label = c("Female", "Male"), class = "factor"),
age = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("35-64",
"65-74"), class = "factor"), admissions = c(1L, 1L, 0L, 0L,
6L, 6L, 6L, 1L, 4L, 0L, 0L, 0L, 4L, 6L, 5L, 2L, 2L, 4L, 1L,
0L, 6L, 5L, 6L, 4L), bdate = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L), .Label = c("1987-01-01", "1987-01-02",
"1987-01-03"), class = "factor")), .Names = c("diag", "sex",
"age", "admissions", "bdate"), row.names = c(NA, -24L), class = "data.frame")
Required output
structure(list(date = structure(1:3, .Label = c("01jan1987",
"02jan1987", "03jan1987"), class = "factor"), f3564card = c(1L,
4L, 2L), f6574card = c(1L, 0L, 4L), m3564card = c(0L, 0L, 1L),
m6574card = c(0L, 0L, 0L), f3564cere = c(6L, 4L, 6L), f6574cere = c(6L,
6L, 5L), m3564cere = c(6L, 5L, 6L), m6574cere = c(1L, 2L,
4L)), .Names = c("date", "f3564card", "f6574card", "m3564card",
"m6574card", "f3564cere", "f6574cere", "m3564cere", "m6574cere"
), class = "data.frame", row.names = c(NA, -3L))
Your data are already in a long format that can be used easily by "reshape2", like this:
library(reshape2) # dcast() is exported by reshape2, not by the older reshape package
# One row per bdate, one column per sex/age/diag combination, filled with admissions.
dcast(df, bdate ~ sex + age + diag, value.var = "admissions")
# bdate Female_35-64_card Female_35-64_cere Female_65-74_card Female_65-74_cere
# 1 1987-01-01 1 6 1 6
# 2 1987-01-02 4 4 0 6
# 3 1987-01-03 2 6 4 5
# Male_35-64_card Male_35-64_cere Male_65-74_card Male_65-74_cere
# 1 0 6 0 1
# 2 0 5 0 2
# 3 1 6 0 4
I don't see any aggregation in your sample output, but if aggregation is required, you can achieve this with the fun.aggregate argument of dcast.
# Read the sample CSV. `header` is spelled out in full (`head` only works through
# partial argument matching) and TRUE/FALSE replace the reassignable T/F.
df <- read.table("D:/Programacao/R/Stackoverflow/Nova pasta/sample.csv",
                 header = TRUE, dec = '.', sep = ',',
                 stringsAsFactors = FALSE)
head(df)
date sex cvd ACS age
1 01 Jul 91 female 0 0 35-64
2 01 Jul 91 male 0 0 35-64
3 01 Jul 91 female 0 0 35-64
4 01 Jul 91 male 1 1 35-64
5 01 Jul 91 female 0 0 65-74
6 02 Jul 91 male 0 0 65-74
Considering that cvd and ACS are not mutually exclusive to males and females respectively,
library(dplyr)
# Sum cvd and ACS for every date/sex/age combination.
# %.% was dplyr's original chaining operator; it was deprecated and later
# removed from dplyr, so %>% is used instead.
df %>%
  group_by(date, sex, age) %>%
  summarise(vcvd = sum(cvd),
            vacs = sum(ACS))
Source: local data frame [111 x 5]
Groups: date, sex
date sex age vcvd vacs
1 01 Jul 91 female 35-64 0 0
2 01 Jul 91 female 65-74 0 0
3 01 Jul 91 male 35-64 1 1
4 02 Aug 91 female 35-64 0 0
5 02 Jul 91 female 65-74 1 0
6 02 Jul 91 male 65-74 0 0
7 03 Aug 91 female 65-74 0 0
8 03 Jul 91 female 35-64 0 0
9 04 Jul 91 male 35-64 1 0
10 04 Jul 91 male 65-74 0 0
.. ... ... ... ... ...

Resources