Count how often two factors have the same output value - r

I want to calculate the number of times two individuals share the same group number. I'm working with quite a large dataset (169 individuals and over 1000 observations (rows) of them) and I'm looking for an efficient way to count how often two individuals occur in the same group. My (simplified) data looks like this:
ID
Group number
Date
Time
Aa
1
15-06-22
15:05:22
Bd
1
15-06-22
15:05:27
Cr
2
15-06-22
15:07:12
Bd
1
15-06-22
17:33:15
Aa
2
15-06-22
17:36:54
Cr
2
15-06-22
17:37:01
...
I would like my output data to look like this:
Aa-Bd
Aa-Cr
Bd-Cr
...
1
1
0
Or:
Occurrence
Dyad
1
Aa-Bd; Aa-Cr
0
Bd-Cr
Or even a matrix might work. I've been trying to replicate the solution posed for this problem: Count occurrences of a variable having two given values corresponding to one value of another variable
but for some reason my matrix remains empty, even though I know that certain individuals have been in groups with others.
Any help and suggestions would be extremely appreciated! I feel like the solution shouldn't be too complicated but for some reason I can't seem to figure it out.
Thanks in advance!
Edit: some example data from dput():
dput(c[1:5,])
structure(list(Date = structure(c(19129, 19129, 19129, 19129,
19129), class = "Date"), Time = c("11:05:58", "11:06:06", "11:06:16",
"11:06:33", "11:06:59"), Data = structure(c(1L, 1L, 1L, 1L, 1L
), .Label = "Crossing", class = "factor"), Group = structure(c(5L,
5L, 5L, 5L, 5L), .Label = c("Ankhase", "Baie Dankie", "Kubu",
"Lemon Tree", "Noha"), class = "factor"), IDIndividual1 = structure(c(158L,
158L, 34L, 153L, 14L), .Label = c("Aaa", "Aal", "Aan", "Aapi",
"Aar", "Aara", "Aare", "Aat", "Amst", "App", "Asis", "Awa", "Beir",
"Bela", "Bet", "Buk", "Daa", "Dais", "Dazz", "Deli", "Dewe",
"Dian", "Digb", "Dix", "Dok", "Dore", "Eina", "Eis", "Enge",
"Fle", "Flu", "Fur", "Gale", "Gaya", "Gese", "Gha", "Ghid", "Gib",
"Gil", "Ginq", "Gobe", "Godu", "Goe", "Gom", "Gran", "Gree",
"Gri", "Gris", "Griv", "Guat", "Gub", "Guba", "Gubh", "Guz",
"Haai", "Hee", "Heer", "Heli", "Hond", "Kom", "Lail", "Lewe",
"Lif", "Lill", "Lizz", "Mara", "Mas", "Miel", "Misk", "Moes",
"Mom", "Mui", "Naal", "Nak", "Ncok", "Nda", "Ndaw", "Ndl", "Ndon",
"Ndum", "Nge", "Nko", "Nkos", "Non", "Nooi", "Numb", "Nurk",
"Nuu", "Obse", "Oerw", "Oke", "Ome", "Oort", "Ouli", "Oup", "Palm",
"Pann", "Papp", "Pie", "Piep", "Pix", "Pom", "Popp", "Prai",
"Prat", "Pret", "Prim", "Puol", "Raba", "Rafa", "Ram", "Rat",
"Rede", "Ree", "Reen", "Regi", "Ren", "Reno", "Rid", "Rim", "Rioj",
"Riss", "Riva", "Rivi", "Roc", "Sari", "Sey", "Sho", "Sig", "Sirk",
"Sitr", "Skem", "Sla", "Spe", "Summary", "Syl", "Tam", "Ted",
"Tev", "Udup", "Uls", "Umb", "Unk", "UnkAM", "UnkBB", "UnkJ",
"UnkJF", "UnkJM", "Upps", "Utic", "Utr", "Vla", "Vul", "Xala",
"Xar", "Xeni", "Xia", "Xian", "Xih", "Xin", "Xinp", "Xop", "Yam",
"Yamu", "Yara", "Yaz", "Yelo", "Yodo", "Yuko"), class = "factor"),
Behaviour = structure(c(2L, 3L, 1L, 1L, 1L), .Label = c("Crossing",
"First Approacher", "First Crosser", "Last Crosser", "Summary"
), class = "factor"), CrossingType = c("Road - Ground Level",
"Road - Ground Level", "Road - Ground Level", "Road - Ground Level",
"Road - Ground Level"), GPSS = c(-27.9999, -27.9999, -27.9999,
-27.9999, -27.9999), GPSE = c(31.20376, 31.20376, 31.20376,
31.20376, 31.20376), Context = structure(c(1L, 1L, 1L, 1L,
1L), .Label = c("Crossing", "Feeding", "Moving", "Unknown"
), class = "factor"), Observers = structure(c(12L, 12L, 12L,
12L, 12L), .Label = c("Christelle", "Christelle; Giulia",
"Christelle; Maria", "Elif; Giulia", "Josefien; Zach; Flavia; Maria",
"Mathieu", "Mathieu; Giulia", "Mike; Mila", "Mila", "Mila; Christelle; Giulia",
"Mila; Elif", "Mila; Giulia", "Nokubonga; Mila", "Nokubonga; Tam; Flavia",
"Nokubonga; Tam; Flavia; Maria", "Nokubonga; Zach; Flavia; Maria",
"Tam; Flavia", "Tam; Zach; Flavia; Maria", "Zach", "Zach; Elif; Giulia",
"Zach; Flavia; Maria", "Zach; Giulia"), class = "factor"),
DeviceId = structure(c(10L, 10L, 10L, 10L, 10L), .Label = c("{129F4050-2294-0D43-890F-3B2DEF58FC1A}",
"{1A678F44-DB8C-1245-8DD7-9C2F92F086CA}", "{1B249FD2-AA95-5745-9A32-56CDD0587018}",
"{2C7026A6-6EDC-BA4F-84EC-3DDADFFD4FD7}", "{2E489E9F-00BE-E342-8CAE-941618B2F0E6}",
"{359CEB57-351F-F54F-B2BD-77A05FB6C349}", "{3727647C-B73A-184B-B187-D6BF75646B84}",
"{7A4E6639-7387-7648-88EC-7FD27A0F258A}", "{854B02F2-5979-174A-AAE8-398C21664824}",
"{89B5C791-1F71-0149-A2F7-F05E0197F501}", "{D92DF19A-9021-A740-AD99-DCCE1D88E064}"
), class = "factor"), Obs.nr = c(1, 1, 1, 1, 1), Gp.nr = c(1,
3, 3, 4, 5)), row.names = c(NA, -5L), groups = structure(list(
Obs.nr = 1, .rows = structure(list(1:5), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
In here Gp.nr is my group number, IDIndividual1 is my ID.

This is not efficient at all, but as a starting point you can use (GN denotes the group number)
# Co-occurrence matrix: entry [i, j] is the number of distinct group numbers
# (df$GN) that individuals my_ID[i] and my_ID[j] have in common. The diagonal
# holds the number of distinct groups each individual appears in.
my_ID <- unique(df$ID)
n_id <- length(my_ID)
# Look up each individual's set of groups once instead of re-filtering df
# inside the double loop.
groups_by_id <- lapply(my_ID, function(id) unique(df$GN[df$ID == id]))
# Row/column names identify the dyad each cell belongs to.
matrix <- array(
  0L,
  dim = c(n_id, n_id),
  dimnames = list(as.character(my_ID), as.character(my_ID))
)
# seq_len() is safe when there are no IDs (1:0 would iterate over c(1, 0)).
for (i in seq_len(n_id)) {
  for (j in seq_len(n_id)) {
    matrix[i, j] <- length(intersect(groups_by_id[[i]], groups_by_id[[j]]))
  }
}

Check this out:
## Creating the Dataframe
df <- data.frame(
  ID = c("Aa", "Bd", "Cc", "Dd", "Cr"),
  GroupNumber = c(1, 2, 1, 3, 3)
)
## Loading the libraries
library(dplyr)
library(tidyverse)
library(stringr)
## Counting, for every pair of individuals, how many groups they share.
## Pasting each group's members into one label (e.g. "Aa-Bd-Cr") and joining
## it against pair labels only matches groups of exactly two members and
## never counts repeated co-occurrences, so the pairs are counted directly.
ids <- unique(as.character(df$ID[!is.na(df$ID)]))
stopifnot(length(ids) >= 2)
## in_group[g, i] is 1 when individual i was recorded in group g, else 0
in_group <- 1 * (
  unclass(table(df$GroupNumber, factor(df$ID, levels = ids))) > 0
)
## shared[i, j] is the number of groups that contain both i and j
shared <- crossprod(in_group)
## All unordered pairs, listed in the order combn() produces them
pair_idx <- t(combn(length(ids), 2))
df1 <- data.frame(
  Comb = paste0(ids[pair_idx[, 1]], "-", ids[pair_idx[, 2]]),
  CountbyID = shared[pair_idx]
)
## Concatenating all pairs that share the same number of groups
df2 <- df1 %>%
  group_by(CountbyID) %>%
  summarise(ID = paste(Comb, collapse = ";"))
Hope this helps!
UPDATE
The way the dataframe is set up is causing issues with the "IDIndividual1" column: it has more factor levels than unique data points. Therefore, I simply converted it to a character. Try the code below:
df <- df[, c("IDIndividual1", "Gp.nr")]
colnames(df) <- c("ID", "GroupNumber")
df$ID <- as.character(df$ID) ## Converting factors to characters
## Loading the libraries
library(dplyr)
library(tidyverse)
library(stringr)
## Counting, for every pair of individuals, how many groups they share.
## Pasting each group's members into one label (e.g. "Aa-Bd-Cr") and joining
## it against pair labels only matches groups of exactly two members and
## never counts repeated co-occurrences, so the pairs are counted directly.
ids <- unique(df$ID[!is.na(df$ID)])
stopifnot(length(ids) >= 2)
## in_group[g, i] is 1 when individual i was recorded in group g, else 0
in_group <- 1 * (
  unclass(table(df$GroupNumber, factor(df$ID, levels = ids))) > 0
)
## shared[i, j] is the number of groups that contain both i and j
shared <- crossprod(in_group)
## All unordered pairs of distinct individuals, in combn() order
pair_idx <- t(combn(length(ids), 2))
df1 <- data.frame(
  Comb = paste0(ids[pair_idx[, 1]], "-", ids[pair_idx[, 2]]),
  CountbyID = shared[pair_idx]
)
## Concatenating all pairs that share the same number of groups
df2 <- df1 %>%
  group_by(CountbyID) %>%
  summarise(ID = paste(Comb, collapse = ";"))

Related

Trying to subset a large table using counts of all row values in a single column

Working in R on genomic data.
I'm trying to subset a very large melted phyloseq table, which includes a column of phylum IDs, in order to remove rows containing phyla that occur less than 100000 times in the table. I might have missed an "easy" way to do this, but I eventually ended up trying to make my own function.
The function:
# Intended to drop the rows of `x` whose phylum occurs too rarely.
# NOTE(review): as written, the function cannot do that; the comments below
# mark each spot where the code departs from that intent.
phylum_subset <- function(x = melt.ALKSS_few, #melted physeq object
Count = melt.ALKSS$Phylum, #counting phyla
Value = 1000 #minimum number of OTUs
){
# `x$Count` looks up a column literally named "Count" in `x`; it does not use
# the `Count` argument. `Value` is never referenced in the body.
phyla.table <- table(x$Count)
# The loop rebinds `Count` to each element of `x` in turn (presumably whole
# columns, if `x` is a data frame -- TODO confirm), so the `if` condition can
# have more than one element. The threshold is hard-coded to 100000.
for(Count in x){if(phyla.table[Count]<=100000)
# The subset() result is neither assigned nor returned, so it is discarded.
subset(x,Phylum != Count)
}
# A `for` loop evaluates to NULL, so that is what the function returns.
}
I will grant that this is my first time writing a function and I don't really know what I'm doing.
My function input and resulting error output ends up like so:
melt.ALKSS_few.count <- phylum_subset(x = melt.ALKSS_few,Count = melt.ALKSS_few$Phylum,Value = 100000)
Error in if (phyla.table[Count] <= 1e+05) subset(x, Count != Phylum_col) :
the condition has length > 1
Because I'm trying to subset by a sum of occurences in a column, across all occurences in that column, I couldn't just use filter() or something once (unless I wanted to do that 500 times). Surely someone has done something like this before?
Edit: OK, trying to provide a reproducible chunk of my dataset. Be warned, it's got over 808k obs of 47 variables because doing genomics on an ecological dataset is a mess. I've removed some variables that are remnants of metadata for previous steps (primer sequences, etc.) that I won't be using in analysis just to keep the code block... less massive.
> dput(droplevels(head(melt.ALKSS_few)))
structure(list(OTU = c("44c21e29adae97a53247abbd73978395", "0f18144d308ada95632ab5193d92073f",
"d829bee4984f82ffc2453212157caf96", "0f18144d308ada95632ab5193d92073f",
"0ddcd311e02f742e2e0e61ce02cf9c29", "120eba657e42a11a5c29f97b90f02035"
), Sample = c("S438", "S680", "S437", "S345", "S454", "S513"),
Abundance = c(10755, 9568, 8186, 7621, 7506, 7501), BarcodeSequence = c("CATTTTAGGACT",
"CGGAATAGAGTA", "CATTTTAGAGTA", "TATAATGGACCA", "CGGAATTGGCAT",
"GACGACGGACCA"), PrimerDesc = c("16S",
"16S", "16S", "16S", "16S", "16S"), SampleName = c("06222021KC-2-R",
"09292021KC-2-R", "06222021KC-1-R", "06032021KC-1-R", "06292921KC-3-R",
"06302021KC-3-R"), Project = c("16SLBSKR1-", "16SLBSKR2-",
"16SLBSKR1-", "16SLBSKR1-", "16SLBSKR1-", "16SLBSKR2-"),
Number = c("456", "694", "455", "363", "471", "491"), Date = c("6_22_2021", "9_29_2021", "6_22_2021", "6_3_2021",
"6_29_2021", "6_30_2021"), Year = c(2021L, 2021L, 2021L,
2021L, 2021L, 2021L), Season = c("Summer", "Fall", "Summer",
"Summer", "Summer", "Summer"), sample_Species = c("Little_Bluestem",
"Little_Bluestem", "Little_Bluestem", "Little_Bluestem",
"Little_Bluestem", "Little_Bluestem"), SoloOrMixed = c("Solo",
"Mixed", "Solo", "Mixed", "Mixed", "Solo"), Location = c("Tyler_SP", "Hy_180", "Tyler_SP", "Roadside_Hy67",
"Copper_Breaks_SP", "Caprock_Canyons_SP"), Ecoregion = c("South_Central_Plains",
"South_Central_Plains", "South_Central_Plains", "Edwards_Plateau",
"Southwestern_Tablelands", "Southwestern_Tablelands"), Habitat = c("Forest",
"Roadside", "Forest", "Roadside", "Roadside", "AridRock"),
Source = c("Root", "Root", "Root", "Root", "Root", "Root"
), PrecipMonth = c(96.65,
37.45, 96.65, 125.94, 125.01, 153.94), PrecipDaysSince = c(1L,
1L, 1L, 1L, 1L, 0L), pH = c(6.8, 6.7, 6.8, 8, 8, 7.8), EC = c(139L,
182L, 139L, 161L, 125L, 2370L), NO3 = c(0, 4.4, 0, 0.2, 2.2,
3.4), P = c(16L, 17L, 16L, 14L, 5L, 6L), K = c(145L, 84L,
145L, 114L, 160L, 65L), Ca = c(3918L, 2159L, 3918L, 27256L,
6609L, 16508L), Mg = c(166L, 130L, 166L, 188L, 148L, 95L),
S = c(10L, 16L, 10L, 24L, 24L, 14299L), Na = c(4L, 3L, 4L,
4L, 4L, 4L), Fe = c(19.76, 17, 19.76, 2.31, 1, 0), Zn = c(2.28,
15.1, 2.28, 7.01, 0.8, 0.1), Mn = c(64.16, 19, 64.16, 27.01,
15, 6), Cu = c(0.16, 0.2, 0.16, 0.16, 0.2, 0.2), Kingdom = c("d__Bacteria",
"d__Bacteria", "d__Bacteria", "d__Bacteria", "d__Bacteria",
"d__Bacteria"), Phylum = c("Proteobacteria", "Proteobacteria",
"Proteobacteria", "Proteobacteria", "Proteobacteria", "Actinobacteriota"
), Class = c("Gammaproteobacteria", "Gammaproteobacteria",
"Alphaproteobacteria", "Gammaproteobacteria", "Gammaproteobacteria",
"Actinobacteria"), Order = c("Xanthomonadales", "Pseudomonadales",
"Rhizobiales", "Pseudomonadales", "Pseudomonadales", "Streptomycetales"
), Family = c("Rhodanobacteraceae", "Pseudomonadaceae", "Xanthobacteraceae",
"Pseudomonadaceae", "Pseudomonadaceae", "Streptomycetaceae"
), Genus = c("Rhodanobacter", "Pseudomonas", "Bradyrhizobium",
"Pseudomonas", "Pseudomonas", "Streptomyces"), Species = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_)), row.names = c(2352002L, 511171L, 7348565L,
510815L, 468295L, 621043L), class = "data.frame")
See below for a tidyverse solution. Note that I've used GlobalPatterns from phyloseq to create a reproducible example.
# library() stops with an error if a package is missing; require() would only
# return FALSE and let the script fail later with a less helpful message.
library("phyloseq")
library("tidyverse")
# Load the data and melt it
data(GlobalPatterns)
psdf <- psmelt(GlobalPatterns)

#' Keep only the rows that belong to sufficiently large groups
#'
#' @param df A data frame.
#' @param grouping_var Name of the grouping column, as a single string.
#' @param threshold Minimum number of rows a group needs in order to be kept.
#' @return `df` without the rows of groups smaller than `threshold`, ungrouped.
subset_by_freq <- function(df, grouping_var, threshold) {
  stopifnot(
    is.data.frame(df),
    is.character(grouping_var), length(grouping_var) == 1L,
    grouping_var %in% names(df),
    is.numeric(threshold), length(threshold) == 1L
  )
  df %>%
    # .data[[ ]] is the documented way to refer to a column by string name
    group_by(.data[[grouping_var]]) %>%
    filter(n() >= threshold) %>%
    ungroup()
}
# Filter out taxa with less than 1e5 counts
psdf_sub <- subset_by_freq(psdf, "Phylum", 1e5)
# Sanity check: count the number of rows per taxon
psdf_sub %>%
  group_by(Phylum) %>%
  tally()
#> # A tibble: 2 x 2
#> Phylum n
#> <chr> <int>
#> 1 Firmicutes 113256
#> 2 Proteobacteria 166816
Created on 2022-08-12 by the reprex package (v2.0.1)

r loop/function to find matches from a list

I have a data frame with sales people spread across three columns, and a separate list of sales people. For each person in that list I want to count:
a) how many times their name appears in any of the three columns
b) how many times their name appears together with a trainee sales person (trainees are people whose name is not in the list)
ilist <- c("SP1","SP2","SP3","SP4","SP5")
df2 <-
data.frame(sales1 = c("SP5","SP5","SP4","SP3","SP2","SP1","SP3"),
sales2 = c("","SP4","SP1","SP1","SP5","SP3",""),
sales3 = c("","SP9","","SP6","","",""))
Output I'd expect something like below answer(though i'd take any output):
A B
SP1 3 1
SP2 1 0
SP3 3 1
SP4 1 1
SP5 3 1
I've tried creating a loop and a function but i cant seem to get them to work.
The aim after getting this to work is to make it part of a group_by so I can break it down by type and year
data %>%
group_by(type,year) %>%
your helpful answer here
edit:
select of the columns I'm looking at using.
My iList would be something like the below
(in the 3 Columns column 2 and 3 will contain blanks where the sales person only appears in column 1; there is also no set placement for where a sales person or trainee may appear)
ilist <- c("SJ","KW","MOLC","FERB","BACC")
structure(list(iYear = structure(c(1L, 4L, 3L, 4L, 4L,
4L, 5L, 5L, 6L, 9L), .Label = c("2020-07-01", "2020-07-02", "2020-07-03",
"2020-07-04", "2020-07-06", "2020-07-07", "2020-07-08", "2020-07-09",
"2020-07-10", "2020-07-11", "2020-07-12", "2020-07-13", "2020-07-14",
"2020-07-15", "2020-07-16", "2020-07-17", "2020-07-18", "2020-07-19",
"2020-07-20", "2020-07-21", "2020-07-22", "2020-07-23", "2020-07-24",
"2020-07-25", "2020-07-27", "2020-07-28", "2020-07-29", "2020-07-30",
"2020-07-31"), class = "factor"), iType = structure(c(4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), .Label = c("", "ZB", "BS",
"CFN", "CTR", "MJ", "UK", "EFH", "ENOC", "EY", "F", "G", "CD",
"HAEM", "HN", "IC", "LB", "LY", "MNN", "MOS", "NERO", "ZZZ",
"ZZZQE", "GFT", "PG", "RE", "SK", "UR"), class = "factor"),
Sales.1 = structure(c(74L, 20L, 74L, 16L,
3L, 3L, 3L, 16L, 58L, 41L), .Label = c("", "ABUE", "AHMEM",
"AJOS", "ANNS", "AOK", "BACC", "BH", "BLAFM", "BLOCA", "BRAD",
"BROWNJ", "BRT", "BUIH", "BURDA", "BURYA", "CANRJ", "CAVM",
"CHAMBA", "COOSNP", "COUPSI", "CPH", "CTT", "DARA", "DILP",
"EXPAT", "FCH", "FERB", "FERMA", "GT", "GT", "HAEM", "HAMJR",
"HENJ", "HENJA", "HOWRA", "HUSA", "ILINC", "JONG", "KC",
"KNOT", "KW", "LAUC", "LOOP", "LYEJO", "LYNN", "MAJJ", "MCGREA",
"MENT", "MKB", "MOLC", "MUDHS", "MULLM", "NC", "NODS",
"O'BSG", "OLIT", "OLIVK", "PAEI", "PARKD", "PATEF", "PERT",
"POL", "PTRHUS", "RAMACN", "RAMS", "REYMA", "ROBCM", "ROBINE",
"SAMJN", "SAYC", "SHARMM", "SHEG", "SJ", "SJN", "SKINT",
"SLOP", "SORT", "SOUBIO", "SPOE", "TELED", "THAN", "THEL",
"TURH", "TURHJ", "UCONS", "UPH", "UT", "VALK", "WALJ"
), class = "factor"), Sales.2 = structure(c(1L,
12L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 45L), .Label = c("", "ABUE",
"AHMEM", "AJOS", "AOK", "BACC", "BH", "BLAFM", "BROWNJ",
"BUIH", "BURDA", "BURYA", "CANRJ", "CAVM", "CHAMBA", "COOSNP",
"COUPSI", "DARA", "DILP", "FCH", "FERB", "FERMA", "GYNT",
"HOWRA", "HUSA", "ILINC", "KW", "LAUC", "LOOP", "LYNN", "MAJJ",
"MOLC", "MULLM", "NC", "OLIVK", "PARKD", "POL", "PTRHUS",
"RAMS", "REYMA", "ROBCM", "ROBINE", "SAMJN", "SHARMM", "SJ",
"SJN", "SKINT", "SLOP", "SORT", "SPOE", "TELED", "THAN",
"THEL", "TURH", "VALK"), class = "factor"), Sales.3 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "AHMEM",
"AOK", "BACC", "BLAFM", "CHAMBA", "COUPSI", "DILP", "FCH",
"KW", "LOOP", "MAJJ", "PTRHUS", "RAMS", "ROBCM", "SAMJN",
"SHARMM", "SJ", "TELED", "THAN", "VALK"), class = "factor")), row.names = c(NA,
10L), class = "data.frame")
I'm not sure this is what you're looking for, but thought it might be helpful. With your interest in using group_by sounds like you might want a tidyverse approach.
Here, would add row numbers, so you can group_by each row to see if a trainee is present with a sales person in the same row.
Then, use pivot_longer to put into a long format, and remove the empty strings.
When grouping by the row number, you can add an indicator that those people will have appeared with a trainee sales person. It looks to see if the person is not contained in the ilist.
Finally, you can group_by each sales person, only include those in ilist with filter, and add up the number of times appeared (assumes only once per row in the initial data), and number of trainee contacts.
library(tidyverse)
# For every sales person in `ilist`: A = number of rows they appear in,
# B = number of those rows that also contain someone who is not in `ilist`.
df2 %>%
  mutate(rn = row_number()) %>%
  pivot_longer(cols = -rn) %>%
  # Drop the empty slots. Piping the whole data frame through na_if("") fails
  # in dplyr >= 1.1.0, where na_if() only accepts vectors; filter() removes
  # both "" and NA values here.
  filter(value != "") %>%
  group_by(rn) %>%
  # any() already yields a single TRUE/FALSE per row group; store it as 1/0
  mutate(with_trainee = as.numeric(any(!value %in% ilist))) %>%
  group_by(value) %>%
  filter(value %in% ilist) %>%
  summarise(A = n(),
            B = sum(with_trainee))
Output
value A B
<chr> <int> <dbl>
1 SP1 3 1
2 SP2 1 0
3 SP3 3 1
4 SP4 2 1
5 SP5 3 1
Edit 1: With your "live data", and grouping results by year from iYear and iType, you can try this:
library(tidyverse)
# Same summary as before, additionally broken down by year and iType:
# A = number of rows a sales person appears in, B = number of those rows
# that also contain someone who is not in `ilist`.
df2 %>%
  mutate(rn = row_number(),
         iYear = substr(iYear, 1, 4)) %>%
  pivot_longer(cols = -c(rn, iYear, iType)) %>%
  # Drop the empty slots. Piping the whole data frame through na_if("") fails
  # in dplyr >= 1.1.0, where na_if() only accepts vectors; filter() removes
  # both "" and NA values here.
  filter(value != "") %>%
  group_by(rn, iYear, iType) %>%
  # any() already yields a single TRUE/FALSE per row group; store it as 1/0
  mutate(with_trainee = as.numeric(any(!value %in% ilist))) %>%
  group_by(value, iYear, iType) %>%
  filter(value %in% ilist) %>%
  summarise(A = n(),
            B = sum(with_trainee))
Edit 2: Additional detailed explanation:
Row numbers (rn via row_number) is helpful in this case, as you want to know if sales people are present at the same time (which implies "within the same row"). So, if 2 sales people share the same rn, they were present at the same time.
iYear is changed to just a year. It uses substr() (substring) to take the 1st through 4th character of iYear which in XXXX-XX-XX date format is the year.
pivot_longer (and its friend, pivot_wider) are really powerful to convert from long <-> wide formats of data. In the tidyr package, pivot_longer takes all the columns (except for rn, iYear, and iType) and puts into two columns (name and value). value contains the salesperson in a single column now, instead of multiple columns it started with.
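na_if("") will make the blank strings "" become NA (missing data). The follow-up na.omit will then remove the rows containing NA. (Note: since dplyr 1.1.0, na_if() only accepts vectors rather than whole data frames; on current versions, filter(value != "") on the long data achieves the same result.)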
The group_by with rn makes sure you are looking collectively at those sales people who share the same rn. I added iYear and iType so that they will also show up in the final summarized results. Then, with_trainee is a new column that records whether that sales person appeared with a trainee (after group_by, any checks whether any person within the group, i.e. sharing the same rn, is not in the ilist vector). If there is such a person, it is coded as 1; if not, it is coded as 0.
The next group_by is by value (or sales person), using filter since you will only want results on those who are in ilist. (If you wanted everyone, including trainees not in ilist, you could leave this line out.)
The final summarise works with the group_by - with n() showing the number of rows of data per value (or per sales person), which is the same as number of different rn values a sales person may appear in overall. The sum(with_trainee) is the total number of times the with_trainee was 1 for a given value (or sales person).
Output
value iYear iType A B
<fct> <chr> <fct> <int> <dbl>
1 SJ 2020 CFN 3 1
I don't really understand the expected result to be honest since you say you expect SP2 | 1 | 0 but SP2 did not appear in line 1. the following might do what you want... or not.
library(data.table)
# Known sales people; anyone else who shows up in a visit is a trainee.
sales <- data.table(sale = c("SP1", "SP2", "SP3", "SP4", "SP5"))
# One row per visit, with up to three participants ("" marks an empty slot).
sales_group <- data.table(
  sales1 = c("SP5", "SP5", "SP4", "SP3", "SP2", "SP1", "SP3"),
  sales2 = c("", "SP4", "SP1", "SP1", "SP5", "SP3", ""),
  sales3 = c("", "SP9", "", "SP6", "", "", "")
)
# Everyone who appears in any slot, sorted, without the empty placeholder.
all <- with(sales_group, sort(unique(c(sales1, sales2, sales3))))
all <- all[nzchar(all)]
trainees <- setdiff(all, c(sales$sale, ""))
# Remember the row position of each visit before the merges reorder rows.
sales_group[, pos := seq_len(.N)]
# Match the sales people against each slot in turn. After each merge the two
# remaining slots become plusone/plustwo and `pos` becomes sales_pos.
visit_cols <- c("sale", "plusone", "plustwo", "sales_pos")
sales1 <- setnames(
  merge(sales, sales_group, by.x = "sale", by.y = "sales1"), visit_cols
)
sales2 <- setnames(
  merge(sales, sales_group, by.x = "sale", by.y = "sales2"), visit_cols
)
sales3 <- setnames(
  merge(sales, sales_group, by.x = "sale", by.y = "sales3"), visit_cols
)
sales_visit_by_sale <- rbindlist(list(sales1, sales2, sales3))
# A visit is "with trainee" when either companion is a trainee.
sales_visit_by_sale[
  , with_trainee := (plusone %in% trainees) | (plustwo %in% trainees)
]
sales_visit_by_sale[(order(sale, sales_pos)), .(sale, sales_pos, with_trainee)]

Create a contingency table with 2 factors from messy data

I have the following data in messy format:
structure(list(com_level = c("B", "B", "B", "B", "A", "A"),
hf_com = c(1, 1, 1, 1, 1, 1),
sal_level = c("2", "3", "1", "2", "1", "4"),
exp_sal = c(NA, 1, 1, NA, 1, NA)),
class = c("tbl_df", "tbl", "data.frame"),
row.names = c(NA, -6L))
Column com_level is the factor with 2 levels and column hf_com gives the frequency count for that level.
Column sal_level is the factor with 4 levels and column exp_sal gives the frequency count for that level.
I want to create a contingency table similar to this:
structure(list(`1` = c(1L, 2L),
`2` = c(0L, 1L),
`3` = c(0L, 2L),
`4` = c(1L, 0L)),
row.names = c("A", "B"), class = "data.frame")
I have code that works when I want to compare two columns with the same factor:
# Step 1: count the rows for every combination of sal_level, exp_sal and
# curr_sal, then spread the counts so that each sal_level gets its own column.
# NOTE(review): df_not_na_num is defined outside this snippet. spread() and
# gather() are superseded in tidyr by pivot_wider() and pivot_longer().
cs_es_table <- df_not_na_num %>%
dplyr::count(sal_level, exp_sal, curr_sal) %>%
tidyr::spread(key = sal_level,value = n) %>% # spreads on a single key (sal_level) only
select(curr_sal, exp_sal, 1, 2, 3, 4, 5, 6, 7, -8) %>% # columns are picked by position, so this depends on the column order spread() produces; column 8 ("no answer") is dropped
as.data.frame()
# Step 2: reshape cs_es_table back to long format and sum the counts per
# salary level, separately for exp_sal and curr_sal.
cs_es_table <- cs_es_table %>%
gather(key, value, -curr_sal,-exp_sal) %>% # one row per (curr_sal, exp_sal, salary-level column)
mutate(curr_val = ifelse(curr_sal == 1,value,NA),
exp_val = ifelse(exp_sal == 1,value,NA)) %>% # keep a count only where the matching indicator equals 1; NA otherwise
group_by(key) %>% # one summary row per salary-level column
summarise_at( .vars = vars(curr_val, exp_val), .funs = sum, na.rm = TRUE)
This code produces this table but just spreads on one key in step 1:
structure(list(curr_val = c(533L, 448L, 237L, 101L, 56L), exp_val = c(179L,
577L, 725L, 401L, 216L)), row.names = c("< 1000 EUR", "1001-1500 EUR",
"2001-3000 EUR", "3001-4000 EUR", "4001-5000 EUR"), class = "data.frame")
Will I need to use pivot_wider as in this example?
Is it possible to use spread on multiple columns in tidyr similar to dcast?
or
tidyr::spread() with multiple keys and values
Any help would be appreciated to compare the two columns with different factors.

Calculating the median of a time series, by 8 every 8 hours

I am new to R and I have to calculate the mean of a time series covering 5 years of hourly measurements of ozone etc.
My df looks like:
structure(list(date = structure(c(1L, 1L, 1L, 1L), .Label = "01.01.2010", class = "factor"),
day.of = c(1L, 1L, 1L, 1L), time = structure(1:4, .Label = c("00:00",
"01:00", "02:00", "03:00"), class = "factor"), SVF_Ray = c(1L,
1L, 1L, 1L), Gmax = c(0, 0, 0, 0), Ta = c(-1.3, -1.2, -1.2,
-1.2), Tmrt = c(-19.3, -12.1, -12, -12.1), PET = c(-10.4,
-8.7, -8.7, -8.7), PT = c(-11.3, -9.3, -9.3, -9.3), Ozon = c(61.35,
62.65, 63.4, 63.85), rDatum = structure(c(14610, 14610, 14610,
14610), class = "Date"), year = c(2010, 2010, 2010, 2010),
month = c(1, 1, 1, 1), day = c(1, 1, 1, 1), hour = c(0, 1,
2, 3)), .Names = c("date", "day.of", "time", "SVF_Ray", "Gmax",
"Ta", "Tmrt", "PET", "PT", "Ozon", "rDatum", "year", "month",
"day", "hour"), row.names = c(NA, 4L), class = "data.frame")
I would like to calculate the mean of Ozon every 8 hours, so a series of 4 calculated means for every day. I have arranged my datum like:
Datum_Ozon$rDatum <- as.Date(data$date, format="%d.%m.%Y")
# Hour of day = the part of "HH:MM" before the colon. sub() works element by
# element, so a missing or malformed time cannot shift the other values the
# way picking every second piece of unlist(strsplit(...)) can.
Datum_Ozon$hour <- as.numeric(sub(":.*$", "", as.character(df$time)))
Format is numeric
But I don't know any further in achieving my goal. Thanks in advance!
If it's the case that your data is regular and complete (i.e., every hour has a record), the following base R code should do the trick:
# Get the number of 8 hour intervals (a trailing partial interval counts too)
intervalCnt <- ceiling(nrow(df) / 8L)
# add a grouping vector to your data: rows 1-8 -> 1, rows 9-16 -> 2, ...
# Integer division works for any number of rows, whereas
# rep(1:intervalCnt, each = 8) fails unless nrow(df) is a multiple of 8.
df$group <- (seq_len(nrow(df)) - 1L) %/% 8L + 1L
# get the median for each interval, keep year var around for later
# (`var` stands for the measured column of interest)
intervalMedian <- aggregate(var ~ group + day + month + year,
                            data = df, FUN = median)
Note that this solution relies on the assumption that the data has a regular structure, i.e., every hour has a record. If the measure of interest is missing, i.e. NA, then simply adding na.rm to the aggregate function will return the statistics of interest:
# get the median for each interval, ignoring missing values
# (spell out TRUE: the shorthand T is an ordinary variable and can be reassigned)
intervalMedian <- aggregate(var ~ group + day + month + year,
                            data = df, FUN = median, na.rm = TRUE)
If you have a variable for hour of the day, here is a simple way to check for data regularity:
table(df$hourOfDay)
The result of this function is a frequency count of each hour. The counts should be equal. Another thing to check is that the first observation starts in the hour following the final observation, i.e. if the hour of observation 1 == "00:00", then the hour of the final observation should be 23:00.
To provide a plot of the mean of the 8 hour periods by year, you can again use aggregate:
# Mean of the interval medians per group
# (spell out TRUE: the shorthand T is an ordinary variable and can be reassigned)
intervalMeans.year <- aggregate(var ~ group, data = intervalMedian,
                                FUN = mean, na.rm = TRUE)
The inclusion of the group, day, month, and year variables in the intervalMedian data.frame allow for a lot of different aggregations. For example, with a minor adjustment, it is possible to get the average value of a variable over the 5 year period for each time period-day-month:
# Label each interval with its position within the day (1, 2, 3, 1, 2, 3, ...).
# length.out must be a count, so pass the number of rows, not the data frame.
intervalMedian$periodDay <- rep(1:3, length.out = nrow(intervalMedian))
intervalMeans.dayMonthPeriod <- aggregate(var ~ periodDay + day + month,
                                          data = intervalMedian,
                                          FUN = mean, na.rm = TRUE)
Here is a basic example using a dplyr pipe rather than a plyr approach as well as ifelse(). Everything is self contained here:
library(dplyr)
## OP data
df <-
structure(list(date = structure(c(1L, 1L, 1L, 1L), .Label = "01.01.2010", class = "factor"),
day.of = c(1L, 1L, 1L, 1L), time = structure(1:4, .Label = c("00:00",
"01:00", "02:00", "03:00"), class = "factor"), SVF_Ray = c(1L,
1L, 1L, 1L), Gmax = c(0, 0, 0, 0), Ta = c(-1.3, -1.2, -1.2,
-1.2), Tmrt = c(-19.3, -12.1, -12, -12.1), PET = c(-10.4,
-8.7, -8.7, -8.7), PT = c(-11.3, -9.3, -9.3, -9.3), Ozon = c(61.35,
62.65, 63.4, 63.85), rDatum = structure(c(14610, 14610, 14610,
14610), class = "Date"), year = c(2010, 2010, 2010, 2010),
month = c(1, 1, 1, 1), day = c(1, 1, 1, 1), hour = c(0, 1,
2, 3)), .Names = c("date", "day.of", "time", "SVF_Ray", "Gmax",
"Ta", "Tmrt", "PET", "PT", "Ozon", "rDatum", "year", "month",
"day", "hour"), row.names = c(NA, 4L), class = "data.frame")
# Median ozone per calendar day and 8-hour chunk of that day.
df %>%
  mutate(DayChunk = case_when(
    hour %in% 0:7 ~ "FirstThird",
    hour %in% 8:15 ~ "SecondThird",
    TRUE ~ "ThirdThird"
  )) %>%
  # df has no `Date` column; create it from rDatum (the Date-class column)
  # so that the grouping does not fail on a missing variable.
  group_by(Date = rDatum, DayChunk) %>%
  summarise(MedOzon = median(Ozon))
Look up the function seq.POSIXt. There are options to specify the start and stop intervals. This function is designed to create sequences of time. For your problem:
myseq<-seq(ISOdate(2010,01,01, 00, 00, 00, tz="GMT"), to=ISOdate(2016,01,05), by = "8 hour")
Use the ISOdate functions to set the start and stop times. If you are going to be working much with times, I suggest researching the function strptime and the POSIXlt/ct time classes.
Now with the breaks defined and assuming you have a column in your dataframe (Datum_Ozon) named "datetime", then use "cut" to group/subset your data.
# Combine the date and time columns into a single date-time. The format is
# passed by name: the second positional argument of as.POSIXct() is `tz`, so
# an unnamed format string only reaches `format` by accident.
Datum_Ozon$datetime <- as.POSIXct(
  paste(as.character(Datum_Ozon$date), as.character(Datum_Ozon$time)),
  format = "%d.%m.%Y %H:%M", tz = "GMT"
)
library(dplyr)
# Bin every observation into its 8-hour interval and average Ozon per bin;
# naming both columns keeps the result easy to use downstream.
Datum_Ozon %>%
  group_by(interval = cut(datetime, myseq)) %>%
  summarize(mean_ozon = mean(Ozon))

Passing current value of ddply split on to function

Here is some sample data for which I want to encode the gender of the names over time:
names_to_encode <- structure(list(names = structure(c(2L, 2L, 1L, 1L, 3L, 3L), .Label = c("jane", "john", "madison"), class = "factor"), year = c(1890, 1990, 1890, 1990, 1890, 2012)), .Names = c("names", "year"), row.names = c(NA, -6L), class = "data.frame")
Here is a minimal set of the Social Security data, limited to just those names from 1890 and 1990:
ssa_demo <- structure(list(name = c("jane", "jane", "john", "john", "madison", "madison"), year = c(1890L, 1990L, 1890L, 1990L, 1890L, 1990L), female = c(372, 771, 56, 81, 0, 1407), male = c(0, 8, 8502, 29066, 14, 145)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -6L), .Names = c("name", "year", "female", "male"))
I've defined a function which subsets the Social Security data given a year or range of years. In other words, it calculates whether a name was male or female over a given time period by figuring out the proportion of male and female births with that name. Here is the function along with a helper function:
require(plyr)
require(dplyr)
#' Gender proportions per name for a year or range of years
#'
#' Reads the global `ssa_demo` table and classifies each name with
#' `male_or_female()`.
#'
#' @param years A single year, or `c(first, last)` for an inclusive range.
#' @return One row per name found in the range, with the summed `female` and
#'   `male` counts, both proportions rounded to 4 digits, and `gender`.
select_ssa <- function(years) {
  # If we get only one year (1890) convert it to a range of years (1890-1890)
  if (length(years) == 1) years <- c(years, years)
  # Calculate the male and female proportions for the given range of years.
  # `%>%` replaces `%.%`, dplyr's original chaining operator, which has since
  # been removed from the package.
  ssa_demo %>%
    filter(year >= years[1], year <= years[2]) %>%
    group_by(name) %>%
    summarise(female = sum(female),
              male = sum(male)) %>%
    mutate(proportion_male = round((male / (male + female)), digits = 4),
           proportion_female = round((female / (male + female)), digits = 4)) %>%
    # vapply() guarantees a character vector even when no rows match
    mutate(gender = vapply(proportion_female, male_or_female, character(1)))
}
# Helper: label a single proportion of female births as "female" (above one
# half), "either" (exactly one half) or "male" (anything lower).
male_or_female <- function(proportion_female) {
  if (proportion_female > 0.5) return("female")
  if (proportion_female == 0.5000) return("either")
  "male"
}
Now what I want to do is use plyr, specifically ddply, to subset the data to be encoded by year, and merge each of those pieces with the value returned by the select_ssa function. This is the code I have.
ddply(names_to_encode, .(year), merge, y = select_ssa(year), by.x = "names", by.y = "name", all.x = TRUE)
When calling select_ssa(year), this command works just fine if I hard code a value like 1890 as the argument to the function. But when I try to pass it the current value for year that ddply is working with, I get an error message:
Error in filter_impl(.data, dots(...), environment()) :
(list) object cannot be coerced to type 'integer'
How can I pass the current value of year on to ddply?
I think you're making things too complicated by trying to do a join inside ddply. If I were to use dplyr I would probably do something more like this:
names_to_encode <- structure(list(name = structure(c(2L, 2L, 1L, 1L, 3L, 3L), .Label = c("jane", "john", "madison"), class = "factor"), year = c(1890, 1990, 1890, 1990, 1890, 2012)), .Names = c("name", "year"), row.names = c(NA, -6L), class = "data.frame")
ssa_demo <- structure(list(name = c("jane", "jane", "john", "john", "madison", "madison"), year = c(1890L, 1990L, 1890L, 1990L, 1890L, 1990L), female = c(372, 771, 56, 81, 0, 1407), male = c(0, 8, 8502, 29066, 14, 145)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -6L), .Names = c("name", "year", "female", "male"))
names_to_encode$name <- as.character(names_to_encode$name)
names_to_encode$year <- as.integer(names_to_encode$year)
# Join the SSA counts to the names to encode, then derive the gender per
# year and name. `%>%` replaces `%.%`, dplyr's original chaining operator,
# which has since been removed from the package. The join keys are spelled
# out so the join does not silently depend on whichever columns share a name.
tmp <- left_join(ssa_demo, names_to_encode, by = c("name", "year")) %>%
  group_by(year, name) %>%
  summarise(female = sum(female),
            male = sum(male)) %>%
  mutate(proportion_male = round((male / (male + female)), digits = 4),
         proportion_female = round((female / (male + female)), digits = 4)) %>%
  mutate(gender = ifelse(proportion_female == 0.5, "either",
                  ifelse(proportion_female > 0.5, "female", "male")))
Note that 0.1.1 is still a little finicky about the types of join columns, so I had to convert them. I think I saw some activity on github that suggested that was either fixed in the dev version, or at least something they're working on.

Resources