Non-normal sampling for card shuffling - r

I have a script that uses sample() to shuffle a pack of cards into four hands - Shown below.
This script produces normally distributed hand splits. That is the distributions of the 13 cards in a suit across the four hands will be "4432" 21.6% of the time, "5442" 15.5% of the time, ... - See here: https://en.wikipedia.org/wiki/Contract_bridge_probabilities#Hand%20pattern%20probabilities.
Is there a way to make sample() give non-normal distributions, e.g. Flatten the profile, so that the distributions across the hands deviate from the expected probabilities? sample() can take a weighting parameter, but can't see how to use this to achieve my goals. Alternatively, is there another sampling function that could provide this?
Thank you,
TC
# Set up
library(tidyverse)
set.seed(123)
# Build pack
pack <- expand.grid(rank = c("A", 2:9, "T", "J", "Q", "K"), suit = c("S", "H", "D", "C")) %>%
as_tibble(.name_repair = "minimal") %>%
mutate(card = paste(suit, rank, sep = "-"))
# Divide cards into hands
for (i in 1:4) {
temp <- sample(pack$card, 13, replace = FALSE) %>%
as_tibble(.name_repair = "minimal") %>%
separate(value, sep = "-", into = c("suit", "rank")) %>%
mutate(
suit = factor(suit, levels = c("S", "H", "D", "C")),
rank = factor(rank, levels = c("A", "K", "Q", "J", "T", 9:2, " "))
) %>%
arrange(suit, rank) %>%
unite("card", sep = "-")
assign(glue::glue("hand{i}"), temp)
pack <- pack %>%
filter(!card %in% unname(unlist(temp)))
}
# Reassemble pack
pack <- hand1 %>%
cbind(hand2) %>%
cbind(hand3) %>%
cbind(hand4) %>%
rename(N = 1, E = 2, S = 3, W = 4)

You can simplify the process of creating the deck, shuffling, and dealing as follows:
# Create the deck
Suit <- c("S", "H", "D", "C")
Rank <- c("A", 2:9, "T", "J", "Q", "K")
Deck <- data.frame(Rank=rep(Rank, 4), Suit=rep(Suit, each=13))
# Shuffle and deal
Shuffle <- Deck[sample(nrow(Deck)), ] # Shuffle the deck
Hand <- factor(rep(c("N", "E", "S", "W"), 13), levels=c("N", "E", "S", "W"))
Deal <- data.frame(Hand, Shuffle)
Now deal is a data frame showing the cards in each hand. To get the distribution of cards by suit:
xtabs(~Suit+Hand, Even)
# Hand
# Suit N E S W
# C 3 3 3 4
# D 3 3 4 3
# H 3 4 3 3
# S 4 3 3 3
To get the hands:
split(Deal, Deal$Hand)
# $N
# Hand Rank Suit
# 12 N Q S
# 2 N 2 S
# 37 N J D
# 49 N T C
# 43 N 4 C
# 17 N 4 H
# 33 N 7 D
# 26 N K H
# 13 N K S
# 44 N 5 C
# 18 N 5 H
# 46 N 7 C
# 11 N J S
. . . . .
Since you mention Hand Pattern Probabilities, it may be helpful to show how you can generate them empirically:
HPP <- function() {
Shuffle <- Deck[sample(nrow(Deck)), ] # Shuffle the deck
Hand <- factor(rep(c("N", "E", "S", "W"), 13), levels=c("N", "E", "S", "W"))
Deal <- data.frame(Hand, Shuffle)
tbl <- xtabs(~Suit+Hand, Deal)
return(unname(apply(tbl, 1, function(x) paste(sort(x, decreasing=TRUE), collapse="-"))))
}
X <- replicate(1000, HPP())
HP <- prop.table(sort(table(X), decreasing=TRUE))
as.matrix(HP)
# [,1]
# 4-4-3-2 0.22200
# 5-3-3-2 0.14925
# 5-4-3-1 0.12075
# 4-3-3-3 0.11225
# 5-4-2-2 0.10775
# 6-3-2-2 0.05175
# 6-4-2-1 0.04850
# . . . .
To get more "even" distributions of cards by suit, you have to break the random distribution of the shuffle, for example we can shuffle the rank values in the deck but preserve the order of the suits:
Even <- Deck
Even$Rank <- c(replicate(4, sample(Rank)))
Even <- data.frame(Hand, Even)
Now the card values will differ, but the distribution by suit will always be the same.

Related

How to construct panel data data frame in R from individual variables efficiently

I would like to construct panel data in R. I know how to do do it in principle but not how to do it efficiently.
My desired outcome would look like:
state city
delta A
delta B
delta C
gamma D
gamma E
...
omega X
I have a variables for individual states that contain cities:
delta <- c("A", "B", "C")
gamma <- c("F", "E)
...
I know that I could simply use:
state <- c("delta", "delta", "delta", "gamma" ... )
city <- c("A", "B", "C", "D" ...)
df <- data.frame(state, city)
but given that I have very large number of sates and cities using the above would be very time consuming.
Is there any more efficient way in which the variables delta, gamma .... can be somehow combined directly into dataframe?
You can use stack -
delta <- c("A", "B", "C")
gamma <- c("F", "E")
stack(dplyr::lst(delta, gamma))
# values ind
#1 A delta
#2 B delta
#3 C delta
#4 F gamma
#5 E gamma
Collect all object names into a list lst and proceed as follows. I emptied my current folder and then proceeded.
delta <- c("A", "B", "C")
gamma <- c("F", "E")
lst <- ls()
library(tidyverse)
map_df(lst, ~get(.x) %>% as.data.frame() %>% setNames('City') %>%
mutate(State = .x))
#> City State
#> 1 A delta
#> 2 B delta
#> 3 C delta
#> 4 F gamma
#> 5 E gamma
Created on 2021-05-22 by the reprex package (v2.0.0)
We can use mget with stack
stack(mget(c('delta', 'gamma')))

R - new variables from two subsets in data frame, random order in rows

I have a data frame containing two sets of variables: First, 30 columns containing 30 stimulus IDs, but in random order for each row. Then, the 30 response values relative to each stimulus. The first column of each block consist of a stimulus-response pair, the second column from each block are the second stimulus response pair etc., but the stimulus id itself varies.
I want to create new variables for each stimulus ID with the corresponding response.
I believe what I have is similar to the end-result of this question: Shuffle a data frame while maintaining order with another data frame
Example:
set.seed(3)
d <- data.frame( a = c("L", "G", "E", "E"),
b = c("G", "E", "L", "G"),
c = c("E", "L", "G", "L"),
e = rnorm(4), f = rnorm(4), g = rnorm(4))
d
# a b c e f g
# 1 L G E -1.1312186 -0.3076564 0.1998116
# 2 G E L -0.7163585 -0.9530173 -0.5784837
# 3 E L G 0.2526524 -0.6482428 -0.9423007
# 4 E G L 0.1520457 1.2243136 -0.2037282
Output I want:
d$L <- c(d[1, 4], d[2, 6], d[3, 5], d[4, 6])
d$E <- c(d[1, 6], d[2, 5], d[3, 4], d[4, 4])
d$G <- c(d[1, 5], d[2, 4], d[3, 6], d[4, 5])
d
# a b c e f g L E
# 1 L G E -1.1312186 -0.3076564 0.1998116 -1.1312186 0.1998116
# 2 G E L -0.7163585 -0.9530173 -0.5784837 -0.5784837 -0.9530173
# 3 E L G 0.2526524 -0.6482428 -0.9423007 -0.6482428 0.2526524
# 4 E G L 0.1520457 1.2243136 -0.2037282 -0.9423007 -1.1312186
I have two problems:
populating the new stimulus variable
repeating this for each stimulus
for 1., I tried nested ifelse statements
d$L <- ifelse(d$a == "L", d$e,
ifelse(d$b=="L", d$f,
ifelse(d$c=="L", d$g, NA)))
but the last ifelse overrides the first two. I tried a dplyr::mutate but can't figure out how to have one single ifelse statement, and with case_when got stuck on how to reference the correct column in the second set containing the response, and not defaulting to the first response column.
For 2.: I think I am supposed to use mapply with the two subsets divided in two separate matrices, but as far as I know, I then need a function-based solution for my first problem.
One option is to create a row/column index to extract the values from columns 4:6 and assign it to three new columns in the dataset
un1 <- unique(unlist(d[1:3]))
d[un1] <- lapply(un1, function(x)
d[4:6][cbind(seq_len(nrow(d)), max.col(d[1:3] == x, "first"))])
data
d <- structure(list(a = c("L", "G", "E", "E"), b = c("G", "E", "L",
"G"), c = c("E", "L", "G", "L"), e = c(-1.1312186, -0.7163585,
0.2526524, 0.1520457), f = c(-0.3076564, -0.9530173, -0.6482428,
1.2243136), g = c(0.1998116, -0.5784837, -0.9423007, -0.2037282
)), class = "data.frame", row.names = c("1", "2", "3", "4"))

Is there an R function to get the unique edges in an undirected (not directed) network?

I want to count the number of the unique edges in an undirected network, e.g, net
x y
1 A B
2 B A
3 A B
There should be only one unique edge for this matrix, because edges A-B and B-A are same for the undirected network.
For the directed network I can get the number of unique edges by:
nrow(unique(net[,c("x","y"]))
But this doesn't work for the undirected network.
Given that you are working with networks, an igraph solution:
library(igraph)
as_data_frame(simplify(graph_from_data_frame(dat, directed=FALSE)))
Then use nrow
Explanantion
dat %>%
graph_from_data_frame(., directed=FALSE) %>% # convert to undirected graph
simplify %>% # remove loops / multiple edges
as_data_frame # return remaining edges
Try this,
df <- data.frame(x=c("A", "B", "A"), y = c("B", "A", "B"))
unique(apply(df, 1, function(x) paste(sort(unlist(strsplit(x, " "))),collapse = " ")))
[1] "A B"
So how does this work?
We are applying a function to each row of the data frame, so we can take each row at a time.
Take the second row of the df,
df[2,]
x y
1 B A
We then split (strsplit) this, and unlist into a vector of each letter, (We use as.matrix to isolate the elements)
unlist(strsplit(as.matrix(df[2,]), " "))
[1] "B" "A"
Use the sort function to put into alphabetical order, then paste them back together,
paste(sort(unlist(strsplit(as.matrix(df[2,]), " "))), collapse = " ")
[1] "A B"
Then the apply function does this for all the rows, as we set the index to 1, then use the unique function to identify unique edges.
Extension
This can be extended to n variables, for example n=3,
df <- data.frame(x=c("A", "B", "A"), y = c("B", "A", "B"), z = c("C", "D", "D"))
unique(apply(df, 1, function(x) paste(sort(unlist(strsplit(x, " "))),collapse = " ")))
[1] "A B C" "A B D"
If more letters are needed, just combine two letters like the following,
df <- data.frame(x=c("A", "BC", "A"), y = c("B", "A", "BC"))
df
x y
1 A B
2 BC A
3 A BC
unique(apply(df, 1, function(x) paste(sort(unlist(strsplit(x, " "))),collapse = " ")))
[1] "A B" "A BC"
Old version
Using the tidyverse package, create a function called rev that can order our edges, then use mutate to create a new column combining the x and y columns, in such a way it works well with the rev function, then run the new column through the function and find the unique pairs.
library(tidyverse)
rev <- function(x){
unname(sapply(x, function(x) {
paste(sort(trimws(strsplit(x[1], ',')[[1]])), collapse=',')} ))
}
df <- data.frame(x=c("A", "B", "A"), y = c("B", "A", "B"))
rows <- df %>%
mutate(both = c(paste(x, y, sep = ", ")))
unique(rev(rows$both))
Here is a solution without the intervention of igraph, all inside one pipe:
df = tibble(x=c("A", "B", "A"), y = c("B", "A", "B"))
It is possible to use group_by() and then sort() combinations of values and paste() them in the new column via mutate(). unique() is utilized if you have "true" duplicates (A-B, A-B will get into one group).
df %>%
group_by(x, y) %>%
mutate(edge_id = paste(sort(unique(c(x,y))), collapse=" "))
When you have properly sorted edge names in a new column, it's quite straightforward to count unique values or filter duplicates out of your data frame.
If you have additional variables for edges, just add them into grouping.
If you're not using{igraph} or just want know how to do it cleanly without any dependencies...
Here's your data...
your_edge_list <- data.frame(x = c("A", "B", "A"),
y = c("B", "A", "B"),
stringsAsFactors = FALSE)
your_edge_list
#> x y
#> 1 A B
#> 2 B A
#> 3 A B
and here's a step-by-step breakdown...
`%>%` <- magrittr::`%>%`
your_edge_list %>%
apply(1L, sort) %>% # sort dyads
t() %>% # transpose resulting matrix to get the original shape back
unique() %>% # get the unique rows
as.data.frame() %>% # back to data frame
setNames(names(your_edge_list)) # reset column names
#> x y
#> 1 A B
If we drop the pipes, the core of it looks like this...
unique(t(apply(your_edge_list, 1, sort)))
#> [,1] [,2]
#> [1,] "A" "B"
And we can wrap it up in a function that 1) handles both directed and undirected, 2) handles data frames and (the more common) matrices, and 3) can drop loops...
simplify_edgelist <- function(el, directed = TRUE, drop_loops = TRUE) {
stopifnot(ncol(el) == 2)
if (drop_loops) {
el <- el[el[, 1] != el[, 2], ]
}
if (directed) {
out <- unique(el)
} else {
out <- unique(t(apply(el, 1, sort)))
}
colnames(out) <- colnames(el)
if (is.data.frame(el)) {
as.data.frame(out, stringsAsFactors = FALSE)
} else {
out
}
}
el2 <- rbind(your_edge_list,
data.frame(x = c("C", "C"), y = c("C", "A"), stringsAsFactors = FALSE))
el2
#> x y
#> 1 A B
#> 2 B A
#> 3 A B
#> 4 C C
#> 5 C A
simplify_edgelist(el2, directed = FALSE)
#> x y
#> 1 A B
#> 5 A C

Pass a df and variable argument in a function in R

How can I write a function with a df and variable argument, evaluating both? I read several posts and blog-posts from r-bloggers and I think I have some problem with the lazy-evaluation, but now I'm terribly confused.
This is my function:
RAM_char_func <- function(dataset, char_var){
a <- dataset[ , c("id", char_var)]
b <- a[[id]][is.na(a[[char_var]]) %in% FALSE]
c <- a[a[[id]] %in% b , ]
c
}
I get this:
Warning
Unknown or uninitialised column: 'char_var'.
This should give me a table (c) with two columns and based on the char_var n-amount lines. while the code works outside the function, I cannot manage to get it working inside the function. I also tried the tidyverse-idea with select and filter, but that doesnt work, too.
I'm using R 3.5.1 with Mac OS X (High Sierra, 10.13.6) and R Studio (latest version).
dataframe
df <- data.frame(id = c(1:10),
var_10 = c(101:110),
var_25 = c("a", "b", NA, "c", NA, "d", NA, "e", "f", NA),stringsAsFactors = F)
df
Code Outside of the function is:
a <- df[ , c("id", "v_25")]
a
b <- a$id[is.na(a$v_25) %in% FALSE]
b
c <- a[a$id %in% b , ]
c
or
library(tidyverse)
df %>%
select(id, var_25) %>%
filter(is.na(var_25) %in% FALSE)
Here is one way to solve your issue:
library(tidyverse)
library(rlang)
remove.na <- function(data, col){
symcol <- enquo(col)
data %>% select(id, !!symcol) %>% filter(!is.na(!!symcol))
}
df %>% remove.na(var_25)
# id var_25
# 1 1 a
# 2 2 b
# 3 4 c
# 4 6 d
# 5 8 e
# 6 9 f
all_equal(df %>% select(id, var_25) %>% filter(!is.na(var_25)), df %>% remove.na(var_25))
[1] TRUE
In the spirit of "teach a man how to fish", you can use the reprex package to be sure you're running your buggy code in a clean environment, and the error will be very explicit (you can also just run the code in a new session as long as you don't have a messy Rprofile file) :
library(reprex)
reprex({
df <- data.frame(id = c(1:10),
var_10 = c(101:110),
var_25 = c("a", "b", NA, "c", NA, "d", NA, "e", "f", NA),stringsAsFactors = F)
RAM_char_func <- function(dataset, char_var){
a <- dataset[ , c("id", char_var)]
b <- a[[id]][is.na(a[[char_var]]) %in% FALSE]
c <- a[a[[id]] %in% b , ]
c
}
RAM_char_func(df,"var_25")
})
If you use Rstudio it will show the output in the Viewer tab, and the error that I get is :
> Error in (function(x, i, exact) if (is.matrix(i)) as.matrix(x)[[i]] else .subset2(x, : objet 'id' introuvable
(sorry for the french)
It tells you object id can't be found, then you can check why your code is trying to access an object named id and you'll find easily the mistake in the 2nd instruction of your function.
If finding the sinful line is hard for you, try running traceback() right after the error or call debugonce(RAM_char_func) then RAM_char_func(df,"var_25") and browse until you find the line which fails.
your function contained an error. [[id]] must be [["id"]]
RAM_char_func <- function(dataset, char_var){
a <- dataset[ , c("id", char_var)]
b <- a[["id"]][is.na(a[[char_var]]) %in% FALSE]
c <- a[a[["id"]] %in% b , ]
c
}
data:
df <- data.frame(id = c(1:10),
var_10 = c(101:110),
var_25 = c("a", "b", NA, "c", NA, "d", NA, "e", "f", NA),stringsAsFactors = F)
call your function:
RAM_char_func(dataset=df,char_var="var_25")
# id var_25
#1 1 a
#2 2 b
#4 4 c
#6 6 d
#8 8 e
#9 9 f

Create an Index of a combination of data.frame columns in R

this question is kind of related to this one, however i want to create an Index using a unique combination of two data.frame columns.
So my data structure looks for example like this (dput):
structure(list(avg = c(0.246985988921473, 0.481522354272779,
0.575400762275067, 0.14651009243539, 0.489308880181752, 0.523678968337178
), i_ID = c("H", "H", "C", "C", "H", "S"), j_ID = c("P", "P",
"P", "P", "P", "P")), .Names = c("avg", "i_ID", "j_ID"), row.names = 7:12, class = "data.frame")
The created Index for the above structure should therefore look like this
1
1
2
2
1
3
In the example data the column j_ID always has the value P, but this isn't always the case. Furthermore vice-versa (S-P or P-S) combinations should result in the same index.
Someone knows a nice way to accomplish that? I can do it with a lot of for-loops and if-else commands, but thats not really elegant.
The interaction function will work well:
foo = structure(list(avg = c(0.246985988921473, 0.481522354272779, 0.575400762275067, 0.14651009243539, 0.489308880181752, 0.523678968337178), i_ID = c("H", "H", "C", "C", "H", "S"), j_ID = c("P", "P", "P", "P", "P", "P")), .Names = c("avg", "i_ID", "j_ID"), row.names = 7:12, class = "data.frame")
foo$idx <- as.integer(interaction(foo$i_ID, foo$j_ID))
> foo
avg i_ID j_ID idx
7 0.2469860 H P 2
8 0.4815224 H P 2
9 0.5754008 C P 1
10 0.1465101 C P 1
11 0.4893089 H P 2
12 0.5236790 S P 3
Ah, I didn't read carefully enough. There is probably a more elegant solution, but you can use outer function and upper and lower triangles:
# lets assign some test values
x <- c('a', 'b', 'c')
foo$idx <- c('a b', 'b a', 'b c', 'c b', 'a a', 'b a')
mat <- outer(x, x, FUN = 'paste') # gives all possible combinations
uppr_ok <- mat[upper.tri(mat, diag=TRUE)]
mat_ok <- mat
mat_ok[lower.tri(mat)] <- mat[upper.tri(mat)]
Then you can match indexes found in mat with those found in mat_ok:
foo$idx <- mat_ok[match(foo$idx, mat)]
To add to Justin's answer, if you would like the indexes to preserve the order of the original of the i_ID, you can assign the interaction() results to a variable and then order the levels.
x <- interaction(foo$i_ID, foo$j_ID)
x <- factor(x, levels=levels(x)[order(unique(foo$i_ID))])
foo$idx <- as.integer(x)
which gives:
> foo
avg i_ID j_ID idx
7 0.2469860 H P 1
8 0.4815224 H P 1
9 0.5754008 C P 2
10 0.1465101 C P 2
11 0.4893089 H P 1
12 0.5236790 S P 3

Resources