Generate Data Frame from Count Data - r

I am trying to create an unsummarized data frame from a data frame of count data.
I have had some experience creating sample datasets but I am having some trouble trying to get a specific number of rows and proportion for each state/person without coding each of them separately and then combining them. I was able to do it using the following code but I feel like there is a better way.
set.seed(2312)
dragon <- sample(c(1),3,replace=TRUE)
Maine <- sample(c("Maine"),3,replace=TRUE)
Maine1 <- data.frame(dragon, Maine)
dragon <- sample(c(0),20,replace=TRUE)
Maine <- sample(c("Maine"),20,replace=TRUE)
Maine2 <- data.frame(dragon, Maine)
Maine2
library(dplyr)
maine3 <- bind_rows(Maine1, Maine2)
Is there a better way to generate this dataset than the code above?
I am trying to create a data frame from the following count data:
+-------------+--------------+--------------+
| | # of dragons | # no dragons |
+-------------+--------------+--------------+
| Maine | 3 | 20|
| California | 1 | 10|
| Jocko | 28 | 110515 |
| Jessica Day | 17 | 26122 |
| | 14 | 19655 |
+-------------+--------------+--------------+
And I would like it to look like this:
+-----------------------+---------------+
| | Dragons (1/0) |
+-----------------------+---------------+
| Maine | 1 |
| Maine | 1 |
| Maine | 1 |
| Maine | 0 |
| Maine….(2:20) | 0…. |
| California | 1 |
| California….(2:10) | 0… |
| Ect.. | |
+-----------------------+---------------+
I do not want the code written for me, but I would love ideas on functions or examples that you think might be helpful.

I am not completely sure what sampling has to do with this problem.
It looks to me like you are looking for untable.
Here is an example
data:
set.seed(1)
no_drag = sample(1:5, 5)
drag = sample(15:25, 5)
df <- data.frame(names = LETTERS[1:5],
drag,
no_drag)
names drag no_drag
1 A 24 2
2 B 25 5
3 C 20 4
4 D 23 3
5 E 15 1
library(reshape)
library(tidyverse)
df %>%
gather(key, value, 2:3) %>% #convert to long format
{untable(.,num = .$value)} %>% #untable by value column
mutate(value = ifelse(key == "drag", 0, 1)) %>% #convert values to 0/1
select(-key) %>% #remove unwanted column
arrange(names) #optional
#part of output
names value
1 A 0
2 A 0
3 A 0
4 A 0
5 A 0
6 A 0
7 A 0
8 A 0
9 A 0
10 A 0
11 A 0
12 A 0
13 A 0
14 A 0
15 A 0
16 A 0
17 A 0
18 A 0
19 A 0
20 A 0
21 A 0
22 A 0
23 A 0
24 A 0
25 A 1
26 A 1
27 B 0
28 B 0
29 B 0
30 B 0
There are other ways to tackle the problem.
One is like #Frank mentioned in the comment:
df %>%
gather(key, val, 2:3) %>%
mutate(v = Map(rep, key == "drag", val)) %>%
unnest %>%
select(-key, -val)
Another:
# Reshape to long format: one row per (names, key) pair with its count.
df <- gather(df, key, value, 2:3)
# Repeat each row `value` times and keep only the `names` and `key` columns.
df <- df[rep(seq_len(nrow(df)), df$value), 1:2]
# Recode `key` to a logical in ONE step: "drag" -> FALSE, anything else -> TRUE.
# Two successive assignments into the character column would first write the
# string "FALSE", which is itself != "drag", so every row would end up "TRUE".
df$key <- df$key != "drag"

One can use tidyr::expand to expand rows in desired format.
The solution using df used by #missuse can be shown as:
library(tidyverse)
df %>% gather(key,value,-names) %>%
mutate(key = ifelse(key=="drag", 1, 0)) %>%
group_by(names,key) %>%
expand(value = 1:value) %>%
select(names, value = key) %>%
as.data.frame()
# names value
# 1 A 0
# 2 A 0
# 3 A 1
# 4 A 1
# 5 A 1
# 6 A 1
# 7 A 1
# 8 A 1
# 9 A 1
# 10 A 1
# ...so on
# 117 E 1
# 118 E 1
# 119 E 1
# 120 E 1
# 121 E 1
# 122 E 1

Related

calculate frequency of unique values per group in R

How can I count the number of unique values such that I go from:
organisation <- c("A","A","A","A","B","B","B","B","C","C","C","C","D","D","D","D")
variable <- c("0","0","1","2","0","0","1","1","0","0","1","1","0","0","2","2")
df <- data.frame(organisation,variable)
organisation | variable
A | 0
A | 1
A | 2
A | 2
B | 0
B | 0
B | 1
B | 1
C | 0
C | 0
C | 1
C | 1
D | 0
D | 2
D | 2
D | 2
To:
unique_values | frequency
0,1,2 | 1
0,1 | 2
0,2 | 1
There are only 3 possible sequences:
0,1,2
0,1
0,2
Try this
s <- aggregate(. ~ organisation , data = df , \(x) names(table(x)))
s$variable <- sapply(s$variable , \(x) paste0(x , collapse = ","))
setNames(aggregate(. ~ variable , data = s , length) , c("unique_values" , "frequency"))
output
unique_values frequency
1 0,1 2
2 0,1,2 1
3 0,2 1
You can do something simple like this:
library(dplyr)
library(stringr)
distinct(df) %>%
arrange(variable) %>%
group_by(organisation) %>%
summarize(unique_values = str_c(variable,collapse = ",")) %>%
count(unique_values)
Output:
unique_values n
<chr> <int>
1 0,1 2
2 0,1,2 1
3 0,2 1

How to One-Hot Encoding stacked columns in R

I have data that look like this
+---+-------+
| | col1 |
+---+-------+
| 1 | A |
| 2 | A,B |
| 3 | B,C |
| 4 | B |
| 5 | A,B,C |
+---+-------+
Expected Output
+---+-----------+
| | A | B | C |
+---+-----------+
|1 | 1 | 0 | 0 |
|2 | 1 | 1 | 0 |
|3 | 0 | 1 | 1 |
|4 | 0 | 1 | 0 |
|5 | 1 | 1 | 1 |
+---+---+---+---+
How can I encode it like this?
Maybe this could help
df %>%
mutate(r = 1:n()) %>%
unnest(col1) %>%
table() %>%
t()
which gives
col1
r A B C
1 1 0 0
2 1 1 0
3 0 1 1
4 0 1 0
5 1 1 1
Data
df <- tibble(
col1 = list(
"A",
c("A", "B"),
c("B", "C"),
"B",
c("A", "B", "C")
)
)
If your data is given in the following format
df <- data.frame(
col1 = c("A", "A,B", "B,C", "B", "A,B,C")
)
then you can try
with(
df,
table(rev(stack(setNames(strsplit(col1, ","), seq_along(col1)))))
)
which gives
values
ind A B C
1 1 0 0
2 1 1 0
3 0 1 1
4 0 1 0
5 1 1 1
You could use table() with map_df() from purrr to count the occurrences
in each element of a list, and return a data frame. Putting it into a
function with some post-processing, and using dplyr's data frame unpacking in
mutate(), you could do something like this to stay within a data frame
context:
library(tidyverse)
# Turn a list of vectors into a table of integer counts: one row per list
# element, one column per distinct value tabulated across the elements.
# x: a list of vectors (the example below passes the result of strsplit()).
one_hot <- function(x) {
# Tabulate each element with table() and row-bind the results.
map_df(x, table) %>%
# Convert every column to integer.
mutate_all(as.integer) %>%
# Replace any NA left in a column with 0L (presumably values that did not
# occur in that element -- confirm against the row-binding behaviour).
mutate_all(replace_na, 0L)
}
df <- data.frame(col1 = c("A", "A,B", "B,C", "B", "A,B,C"))
df %>%
mutate(
one_hot(strsplit(col1, ","))
)
#> col1 A B C
#> 1 A 1 0 0
#> 2 A,B 1 1 0
#> 3 B,C 0 1 1
#> 4 B 0 1 0
#> 5 A,B,C 1 1 1
An additional base R solution:
+(
with(
df,
sapply(
unique(
unlist(
strsplit(
col1,
","
)
)
),
`grepl`,
col1
)
)
)

How to calculate a variable using a lagged value using R

Suppose I have a simple dataset
df <- data.frame(id=c("A","B","C","D","E","F"),
value=c(1,NA,NA,NA,NA,NA))
I want to recode value (or create a new variable) so that each subsequent value is equal to the previous value * 2 + the previous value.
| id | value |
|----|-------|
| A | 1 |
| B | 3 |
| C | 9 |
| D | 27 |
| E | 81 |
| F | 243 |
I thought I could do this using lag:
df <- df %>%
mutate(value=(lag(value)*2)+lag(value))
But that didn't work. So instead I used a for loop
# Fill each row from the one before it:
#   value[i] = value[i - 1] * 2 + value[i - 1]
# seq_len(nrow(df))[-1] is 2..nrow(df), but is empty for a one-row data frame,
# whereas 2:nrow(df) would count down as c(2, 1).
for (i in seq_len(nrow(df))[-1]) {
  # The subscript must be the loop variable `i`; capital `I` is base R's I()
  # function and is not a valid row index.
  df[i, "value"] <- (df[i - 1, "value"] * 2) + df[i - 1, "value"]
}
That works but seems inelegant. Is there a better way to do this using tidyverse conventions/tools?
We can use accumulate from purrr
library(dplyr)
library(purrr)
df %>%
mutate(value = accumulate(value, ~ .x * 2 + .x))
# id value
#1 A 1
#2 B 3
#3 C 9
#4 D 27
#5 E 81
#6 F 243
Or more compact
df %>%
mutate(value = accumulate(value, ~ .x* 3))
Or in base R with Reduce
Reduce(function(x, y) x * 2 + x, df$value, accumulate = TRUE)
#[1] 1 3 9 27 81 243
We can use accumulate from purrr :
library(dplyr)
df %>%
mutate(value = purrr::accumulate(value[-n()], ~.x * 2 + .x,
.init = first(value)))
# id value
#1 A 1
#2 B 3
#3 C 9
#4 D 27
#5 E 81
#6 F 243
Which can be done similarly in base R using Reduce
Reduce(function(x, y) x * 2 + x, df$value[-nrow(df)], init = df$value[1],
accumulate = TRUE)
#[1] 1 3 9 27 81 243

Random sampling only a subset of data in R

I have a dataset (N of 2794) of which I want to extract a subset, randomly reallocate the class and put it back into the dataframe.
Example
| Index | B | C | Class|
| 1 | 3 | 4 | Dog |
| 2 | 1 | 9 | Cat |
| 3 | 9 | 1 | Dog |
| 4 | 1 | 1 | Cat |
From the above example, I want to randomly take N observations from column 'Class' and mix them up so you get something like this:
| Index | B | C | Class|
| 1 | 3 | 4 | Cat | Re-sampled
| 2 | 1 | 9 | Dog | Re-sampled
| 3 | 9 | 1 | Dog |
| 4 | 1 | 1 | Dog | Re-sampled
This code randomly extracts rows and re samples them, but I don't want to extract the rows. I want to keep them in the dataframe.
sample(Class[sample(nrow(Class),N),])
Suppose df is your data frame:
df <- data.frame(index=1:4, B=c(3,1,9,1), C=c(4,9,1,1), Class=c("Dog", "Cat", "Dog", "Cat"))
Would this do what you want?
dfSamp <- sample(1:nrow(df), N)
df$Class[dfSamp] <- sample(df$Class[dfSamp])
I simulated the data frame and did an example:
df <- data.frame(
ID=1:4,
Class=c('Dog', 'Cat', 'Dog', 'Cat')
)
N <- 2
sample_ids <- sample(nrow(df), N)
df$Class[sample_ids] <- sample(df$Class, length(sample_ids))
Assuming Class is the name of your data frame, you could do this:
library(dplyr)
bind_rows(
  # Original rows, tagged so they can be told apart from the resampled ones.
  Class %>%
    mutate(origin = "not_sampled"),
  # slice_sample() draws ROWS with replacement. Base sample() applied to a
  # data frame would draw from its columns instead.
  Class %>%
    slice_sample(n = 100, replace = TRUE) %>%
    mutate(origin = "sampled")
)
Sample 100 observations of the original dataframe and stack them to the bottom of it. I am also adding a column so that you know if the observation was sampled or present in the dataframe from the beginning.
What you're wanting to do is replace in-line some classes, but not others.
So, if we start with a data frame, df
set.seed(100)
df = data.frame(index = 1:100,
B = sample(1:10,100,replace = T),
C = sample(1:10,100,replace = T),
Class = sample(c('Cat','Dog','Bunny'),100,replace = T))
And you want to update 5 random rows, then we need to pick which rows to update and what new classes to put in those rows. By referencing unique(df$Class) you don't weight the classes by their current occurrence. You could adjust this with the prob argument of sample(), or remove unique() to use occurrence as the weight.
n_rows = 5
rows_to_update = sample(1:100,n_rows,replace = F)
new_classes = sample(unique(df$Class),n_rows,replace = T)
rows_to_update
#> [1] 85 65 94 60 48
new_classes
#> [1] "Bunny" "Dog" "Dog" "Dog" "Bunny"
We can inspect what the original data looked like
df[rows_to_update,]
#> index B C Class
#> 85 85 1 2 Dog
#> 65 65 5 1 Bunny
#> 94 94 5 10 Dog
#> 60 60 3 7 Bunny
#> 48 48 9 1 Cat
We can update this in place with a reference to the column and the rows to update.
df$Class[rows_to_update] = new_classes
df[rows_to_update,]
#> index B C Class
#> 85 85 1 2 Bunny
#> 65 65 5 1 Dog
#> 94 94 5 10 Dog
#> 60 60 3 7 Dog
#> 48 48 9 1 Bunny

Cross tabulate multiple response questions

I need to cross tabulate multiple responses (stored as a set of variables) by a grouping variable. My survey question is: "Which of the following fruits have you had?" The respondent from either geographical Area 1 or Area 2 is then given a list with "1. Orange, 2. Mango, ..." and the resulting data from the yes (1) or no (0) questions is:
set.seed(1)
df <- data.frame(area=rep(c('Area 1','Area 2'), each=6),
var_orange=sample(0:1, 12, T),
var_banana=sample(0:1, 12, T),
var_melon=sample(0:1, 12, T),
var_mango=sample(0:1, 12, T))
area var_orange var_banana var_melon var_mango
1 Area 1 0 1 0 1
2 Area 1 0 0 0 0
3 Area 1 1 1 0 1
4 Area 1 1 0 0 0
5 Area 1 0 1 1 1
6 Area 1 1 1 0 1
7 Area 2 1 0 0 1
8 Area 2 1 1 1 1
9 Area 2 1 1 0 1
10 Area 2 0 0 0 1
11 Area 2 0 1 1 0
12 Area 2 0 0 1 0
I would like to get a summary output like this one generated in Stata:
| area
| Area 1 Area 2 | Total
------------+------------------------+-----------
var_orange | 50.00 50.00 | 50.00
var_banana | 66.67 50.00 | 58.33
var_melon | 16.67 50.00 | 33.33
var_mango | 66.67 66.67 | 66.67
------------+------------------------+-----------
Total | 200.00 216.67 | 208.33
I found a related post with a multfreqtable function which gives a one-way summary for my data:
#' One-way frequency table for multiple-response question sets
#'
#' @param data A data frame with one indicator column per response option
#'   (non-zero means the option was chosen).
#' @param question.prefix Character vector of patterns; each one is matched
#'   against `names(data)` with `grep()` to select one question's columns.
#' @return A list named by `question.prefix`. Each element is a data frame
#'   with columns `question`, `freq`, `percent_response` (share of all
#'   non-zero responses) and `percent_cases` (share of rows with at least one
#'   non-zero response, rounded to 2 digits); its last row is "Total".
multfreqtable <- function(data, question.prefix) {
  temp <- vector("list", length(question.prefix))
  names(temp) <- question.prefix
  # seq_along() makes an empty prefix vector yield an empty list; 1:0 would
  # iterate over c(1, 0).
  for (i in seq_along(question.prefix)) {
    # drop = FALSE keeps a data frame even when a single column matches;
    # colSums()/rowSums() cannot take the bare vector `[` would return.
    items <- data[, grep(question.prefix[i], names(data)), drop = FALSE]
    n_responses <- sum(items != 0)
    item_counts <- colSums(items)
    n_cases <- sum(rowSums(items) != 0)
    freq <- as.numeric(c(item_counts, n_responses))
    temp[[i]] <- data.frame(
      question = c(sub(question.prefix[i], "", names(item_counts)), "Total"),
      freq = freq,
      percent_response = (freq / n_responses) * 100,
      percent_cases = round((freq / n_cases) * 100, 2)
    )
  }
  temp
}
multfreqtable(df, "var_")
$var_
question freq percent_response percent_cases
1 orange 6 24 54.55
2 banana 7 28 63.64
3 melon 4 16 36.36
4 mango 8 32 72.73
5 Total 25 100 227.27
But I am interested in a two-way summary.
I could use dplyr as suggested in a post and get:
df %>%
summarise(orange_pct=round(sum(var_orange,na.rm=TRUE)*100/n(),2),
banana_pct=round(sum(var_banana,na.rm=TRUE)*100/n(),2),
melon_pct=round(sum(var_melon,na.rm=TRUE)*100/n(),2),
mango_pct=round(sum(var_mango,na.rm=TRUE)*100/n(),2))
orange_pct banana_pct melon_pct mango_pct
1 50 58.33 33.33 66.67
But I need a neater table output with marginal column frequencies.
You could first calculate the values using dplyr, then put them in a table using e.g. knitr::kable.
library(dplyr)
library(knitr)
set.seed(1)
df <- data.frame(area = rep(c('Area 1','Area 2'), each = 6),
var_orange = sample(0:1, 12, T),
var_banana = sample(0:1, 12, T),
var_melon = sample(0:1, 12, T),
var_mango = sample(0:1, 12, T))
# Per-area share of "yes" answers for every fruit column.
t1 <- df %>%
  group_by(area) %>%
  summarise(across(everything(), mean))
# Overall shares. `area` is set to NA explicitly so t2 keeps the same columns
# as t1 (needed by rbind) without taking mean() of a character column.
t2 <- df %>%
  summarise(area = NA_character_, across(starts_with("var_"), mean))
kable(rbind(t1, t2))
And you would get:
|area | var_orange| var_banana| var_melon| var_mango|
|:------|----------:|----------:|---------:|---------:|
|Area 1 | 0.5| 0.6666667| 0.1666667| 0.6666667|
|Area 2 | 0.5| 0.5000000| 0.5000000| 0.6666667|
|NA | 0.5| 0.5833333| 0.3333333| 0.6666667|
To further polish the output to mimic that from Stata:
polished <- 100 * rbind(t1, t2) %>% # Use percentages
select(-area) %>% # Drop "area"
mutate(Total = rowSums(.[])) %>% # Add Total
as.matrix %>% t
kable(polished, digits = 2, col.names = c("Area 1", "Area 2", "Total"))
The end result would be:
| | Area 1| Area 2| Total|
|:----------|------:|------:|------:|
|var_orange | 50.00| 50.00| 50.00|
|var_banana | 66.67| 50.00| 58.33|
|var_melon | 16.67| 50.00| 33.33|
|var_mango | 66.67| 66.67| 66.67|
|Total | 200.00| 216.67| 208.33|
A different solution using aggregate is
T1 = aggregate(df[,2:5], list(df$area), sum)
rownames(T1) = T1[,1]
T1 = t(T1[,-1])
T1 = addmargins(T1, 1:2, FUN = c(Total = sum), quiet=TRUE)
T1
Area 1 Area 2 Total
var_orange 3 3 6
var_banana 4 3 7
var_melon 1 3 4
var_mango 4 4 8
Total 12 13 25
Thanks to #rawr for suggesting the simplification of using addmargins.
If you want the table expressed as percentages instead of counts, simply divide by the total count to get the fraction and then change to a percentage.
T1 = aggregate(df[,2:5], list(df$area), sum)
rownames(T1) = T1[,1]
T1 = t(T1[,-1])
T1 = T1 * 100 / sum(T1)
T1 = addmargins(T1, FUN = c(Total = sum), quiet=TRUE)
T1
Area 1 Area 2 Total
var_orange 12 12 24
var_banana 16 12 28
var_melon 4 12 16
var_mango 16 16 32
Total 48 52 100

Resources