Find duplicate values and have references

Find duplicate values and have references - r

My data is data.lab:
data.lab <- data.frame(Name=c("A","e","b","c","d"),
bp =c( 12,12,11,12,11),
sugar = c(19,21,23,19,23))
I want to keep only the rows whose values are duplicated, together with a reference column that numbers each row within its group of duplicates.
Desired output:
lab.data <- data.frame(Name=c("A","b","c","d"),
bp =c( 12,11,12,11),
sugar = c(19,23,19,23),
pair=c(1,1,2,2))
dub.data <- duplicated(data.lab) | duplicated(data.lab, fromLast = TRUE)
out.1=data.lab[dub.data, ]
This gives the duplicated rows, but I also need a column indicating which rows form each duplicate pair.

With dplyr, you can do:
data.lab %>%
group_by(bp, sugar) %>%
filter(n() == 2) %>%
mutate(pair = seq_along(Name))
Name bp sugar pair
<fct> <dbl> <dbl> <int>
1 A 12 19 1
2 b 11 23 1
3 c 12 19 2
4 d 11 23 2
Or:
data.lab %>%
group_by(bp, sugar) %>%
filter(n() == 2) %>%
mutate(pair = row_number())
Or, if a group could contain more than two duplicated rows:
data.lab %>%
group_by(bp, sugar) %>%
filter(n() > 1) %>%
mutate(pair = seq_along(Name))
Or:
data.lab %>%
group_by(bp, sugar) %>%
filter(n() > 1) %>%
mutate(pair = row_number())
Or, to group by all variables except "Name":
data.lab %>%
group_by_at(vars(-matches("(Name)"))) %>%
filter(n() > 1) %>%
mutate(pair = seq_along(Name))
Or:
data.lab %>%
group_by_at(vars(-matches("(Name)"))) %>%
filter(n() > 1) %>%
mutate(pair = row_number())

Continuing from your approach, we can use ave in base R:
dat1 <- data.lab[duplicated(data.lab[c("bp", "sugar")]) |
duplicated(data.lab[c("bp", "sugar")], fromLast = TRUE) , ]
dat1$pair <- with(dat1, ave(Name, bp, sugar, FUN = seq_along))
dat1
# Name bp sugar pair
#1 A 12 19 1
#2 b 11 23 1
#3 c 12 19 2
#4 d 11 23 2

Related

How to append dataframe column names with for-loop iteration count in R?

This is my first time trying a for-loop in R. I am trying to append the loop iteration count to the names of the data frame columns created inside the loop, as shown in the image below. The columns concat, alloc, merge, and reSeq are generated in the loop, and I'd like to rename them to concat_1, alloc_1, merge_1, reSeq_1 for the first iteration, and so on. Any recommendations for how to do this?
I'm starting simple, looping only once so I can ease into this. In my next step I'll expand the loop so that columns are added to the right and renamed sequentially in the same way. I did try to build the column name inside the first mutate under the for-loop in the code below, for each reference to concat, but it doesn't work: mutate(paste0("concat_",i) = as.numeric... So I imagine the solution will be something like running colnames(...) at the bottom of the loop.
Here's the code with example DF:
library(dplyr)
myDF1 <- data.frame(
Name = c("R","R","B","R","X","X"),
Group = c(0,0,0,0,1,1))
nCode <- myDF1 %>%
group_by(Name) %>%
mutate(nmCnt = row_number()) %>%
ungroup() %>%
mutate(seqBase = ifelse(Group == 0 | Group != lag(Group), nmCnt,0)) %>%
mutate(seqBase = na_if(seqBase, 0)) %>%
group_by(Name) %>%
fill(seqBase) %>%
mutate(seqBase = match(seqBase, unique(seqBase))) %>%
ungroup %>%
mutate(grpRnk = ifelse(Group > 0, sapply(1:n(), function(x) sum(Name[1:x]==Name[x] & Group[1:x] == Group[x])),0))
loopCntr <- nrow(unique(myDF1[myDF1$Group!=0,]))
for(i in 1:1) {nCode <- nCode %>%
mutate(concat = as.numeric(paste0(seqBase,".",grpRnk)))
index <- filter(nCode, Group !=0) %>%
select(concat) %>%
distinct() %>%
mutate(truncInd = trunc(concat)) %>%
group_by(truncInd) %>%
mutate(cumGrp = cur_group_id()) %>%
ungroup() %>%
select(-truncInd)
index <- if(ifelse(loopCntr > 0, min(index$concat), Inf) >= 2){
rbind(data.frame(concat=c(1),cumGrp=c(1)),index)}else{index}
nCode <- nCode %>%
mutate(alloc = index$concat[index$cumGrp==1][nmCnt]) %>%
mutate(merge = ifelse(is.na(alloc),seqBase,alloc)) %>%
group_by(Name) %>%
mutate(reSeq = match(trunc(merge), unique(trunc(merge)))) %>%
mutate(reSeq = (reSeq + round(merge%%1 * 10,0)/10)) %>%
ungroup()
} # end for-loop
print.data.frame(nCode)

You could assign the new names with `names<-` or setNames, or use rename_with at the end of the loop:
library(dplyr)
library(stringr)
for(i in 1:1) {nCode <- nCode %>%
mutate(concat = as.numeric(paste0(seqBase,".",grpRnk)))
index <- filter(nCode, Group !=0) %>%
select(concat) %>%
distinct() %>%
mutate(truncInd = trunc(concat)) %>%
group_by(truncInd) %>%
mutate(cumGrp = cur_group_id()) %>%
ungroup() %>%
select(-truncInd)
index <- if(ifelse(loopCntr > 0, min(index$concat), Inf) >= 2){
rbind(data.frame(concat=c(1),cumGrp=c(1)),index)}else{index}
nCode <- nCode %>%
mutate(alloc = index$concat[index$cumGrp==1][nmCnt]) %>%
mutate(merge = ifelse(is.na(alloc),seqBase,alloc)) %>%
group_by(Name) %>%
mutate(reSeq = match(trunc(merge), unique(trunc(merge)))) %>%
mutate(reSeq = (reSeq + round(merge%%1 * 10,0)/10)) %>%
ungroup()
nCode <- nCode %>%
rename_with(~ str_c(.x, "_", i), c("concat", "alloc", "merge", "reSeq"))
} # end for-loop
-output
> nCode
# A tibble: 6 × 9
Name Group nmCnt seqBase grpRnk concat_1 alloc_1 merge_1 reSeq_1
<chr> <dbl> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 R 0 1 1 0 1 1.1 1.1 1.1
2 R 0 2 2 0 2 1.2 1.2 1.2
3 B 0 1 1 0 1 1.1 1.1 1.1
4 R 0 3 3 0 3 NA 3 2
5 X 1 1 1 1 1.1 1.1 1.1 1.1
6 X 1 2 1 2 1.2 1.2 1.2 1.2
Another option is to build the column name dynamically with !! and := when assigning inside mutate:
...
mutate(!! paste0("alloc_", i) := index$concat[index$cumGrp==1][nmCnt])%>%
...

Summarize occurrences by area and then by custom groups

I have the code below, which takes a two-column dataset and creates age-group categories based on the stated CustomerAge.
library(tidyverse)
df <-
read.table(textConnection("Area CustomerAge
A 28
A 40
A 70
A 19
B 13
B 12
B 72
B 90"), header=TRUE)
df2 <- df %>%
mutate(
# Create categories
Customer_Age_Group = dplyr::case_when(
CustomerAge <= 18 ~ "0-18",
CustomerAge > 18 & CustomerAge <= 60 ~ "19-60",
CustomerAge > 60 ~ ">60"
))
What I am looking to achieve is an output summary that looks like the one below:
Area  Customer_Age_Group  Occurrences
A     0-18                0
A     19-60               3
A     >60                 1
B     0-18                2
B     19-60               0
B     >60                 2

To also include groups with 0 occurrences, you need count(), ungroup() and complete():
df2 %>% group_by(Area, Customer_Age_Group,.drop = FALSE) %>%
count() %>%
ungroup() %>%
complete(Area, Customer_Age_Group, fill=list(n=0))
This will also show groups with 0 occurrences.
To sort by Area and age group:
df2 %>% group_by(Area, Customer_Age_Group,.drop = FALSE) %>%
count() %>%
ungroup() %>%
complete(Area, Customer_Age_Group, fill=list(n=0)) %>%
arrange(Area, parse_number(Customer_Age_Group))

group_by and summarise are what you're looking for.
df2 %>% group_by(Area, Customer_Age_Group) %>% summarise(Occurences = n())
However, note that this won't show categories with zero occurrences in your data set.

creating dataframe from vectors

I have the following vectors:
bid = c(1,5,10,20,30,40,50)
n = c(31,29,27,25,23,21,19)
yes = c(0,3,6,7,9,13,17)
no = n - yes
I have two questions for which I can't find a solution; I would appreciate it if someone could help me.
Q1: I want to write R code to create a two-column data frame df. Column 1 has Bid,
where each Bid is repeated n times; Column 2 has c(rep(1,yes),rep(0,no)) at
each bid.
Q2: Then, once I have the data frame df, I want to write R code to regenerate
(from df) the vectors bid, n, yes, and no.

It is a bit unclear what you actually want. It is easier if you provide the desired result. Would this fit your Q1:
library(tidyverse)
bid = c(1,5,10,20,30,40,50)
n = c(31,29,27,25,23,21,19)
yes = c(0,3,6,7,9,13,17)
no = n - yes
df <- tibble(bid, yes, n, no = n -yes) %>% dplyr::select(- n) %>% pivot_longer(cols = c(yes, no)) %>% uncount(value) %>% mutate(yesno = ifelse(name == "yes", 1,0)) %>% dplyr::select(-name)
df2 <- df %>% group_by(bid) %>% table() %>% as.data.frame() %>% pivot_wider(id_cols = bid, names_from = yesno, values_from = Freq) %>% mutate(n = yes + no) %>% rename(no = `0`, yes = `1`)
bid <- df2$bid
n <- df2$n
yes <- df2$yes

I don't know what you mean for Q2, but for Q1 you could do this:
library(tidyverse)
pmap_dfr(list(bid, n, yes, no),
\(V1, V2, V3, V4) tibble(col1 = rep(V1, V2),
col2 = c(rep(1,V3),rep(0,V4))))
#> # A tibble: 175 x 2
#> col1 col2
#> <dbl> <dbl>
#> 1 1 0
#> 2 1 0
#> 3 1 0
#> 4 1 0
#> 5 1 0
#> 6 1 0
#> 7 1 0
#> 8 1 0
#> 9 1 0
#> 10 1 0
#> # ... with 165 more rows
EDIT:
For Q2, you can follow this:
library(tidyverse)
df <- pmap_dfr(list(bid, n, yes, no),
\(V1, V2, V3, V4) tibble(col1 = rep(V1, V2),
col2 = c(rep(1,V3),rep(0,V4))))
df2 <- df |>
count(col1, col2) |>
group_by(col1) |>
summarise(yes = sum(n[col2==1]),
n = sum(n))
bid2 <- df2$col1
n2 <- df2$n
yes2 <- df2$yes
no2 <- n2 - yes2
all.equal(c(bid, n, yes, no), c(bid2, n2, yes2, no2))
#> [1] TRUE

Filter data by group & preserve empty groups

I wonder how I can filter my data by group and preserve the groups that are empty.
Example:
year = c(1,2,3,1,2,3,1,2,3)
site = rep(c("a", "b", "d"), each = 3)
value = c(3,3,0,1,8,5,10,18,27)
df <- data.frame(year, site, value)
I want to subset the rows where the value is more than 5. For some groups, this is never true. The filter function simply drops those empty groups.
How can I keep my empty groups and have NA instead? Ideally, I would like to use dplyr functions instead of base R.
My filtering approach, where .preserve does not preserve empty groups:
df %>%
group_by(site) %>%
filter(value > 5, .preserve = TRUE)
Expected output:
year site value
<dbl> <fct> <dbl>
1 NA a NA
2 2 b 8
3 1 d 10
4 2 d 18
5 3 d 27

With the addition of tidyr, you can do:
df %>%
group_by(site) %>%
filter(value > 5) %>%
ungroup() %>%
complete(site = df$site)
site year value
<fct> <dbl> <dbl>
1 a NA NA
2 b 2 8
3 d 1 10
4 d 2 18
5 d 3 27
Or if you want to keep it in dplyr:
df %>%
group_by(site) %>%
filter(value > 5) %>%
bind_rows(df %>%
group_by(site) %>%
filter(all(value <= 5)) %>%
summarise_all(~ NA))

Using the nesting functionality of tidyr and applying purrr::map:
df %>%
group_by(site) %>%
tidyr::nest() %>%
mutate(data = purrr::map(data, . %>% filter(value > 5))) %>%
tidyr::unnest(cols=c(data), keep_empty = TRUE)

R Sum numbers within string

I have a question:
I have a dataset like this simple example:
df<-data.frame(ID=c("A","B","C","D"),
Score=c("15","16/18/19+2/6","3/+2","19/18/14"))
I want to end up with a dataset in which the score numbers are split. I have a problem with the /+2 part. When it says "3/+2", it actually means "3/3+2", which would finally give "3/5". So what I would like some help with is ending up with a dataset like this:
ID Score
A 15
B 16/18/19/21/6
C 3/5
D 19/18/14
I already found out that I can then separate the score with:
df<-df %>%
mutate(Score = strsplit(as.character(ID), "/")) %>%
unnest(Score)
But I don't know how to duplicate the preceding number and then add to it when /+ occurs. Could someone help me?

It could probably be solved in a more elegant way, but here is one possibility:
df %>%
mutate(Score = strsplit(as.character(Score), "/")) %>%
unnest() %>%
rowwise() %>%
mutate(Score = eval(parse(text = paste0(Score)))) %>%
group_by(ID) %>%
mutate(Score = paste0(Score, collapse = "/")) %>%
distinct()
ID Score
<fct> <chr>
1 A 15
2 B 16/18/21/6
3 C 3/5
4 D 19/18/14
Sample data:
df <- data.frame(ID=c("A","B","C","D"),
Score=c("15","16/18/19+2/6","3/3+2","19/18/14"))
It splits "Score" on /, converts each piece to an expression with parse(), evaluates it, and then pastes the results back together per ID.
Using the data you provided and the pattern from @A. Suliman:
df %>%
mutate(Score = strsplit(gsub("(\\d+)/*\\+(\\d+)","\\1/\\1+\\2", Score), "/")) %>%
unnest() %>%
rowwise() %>%
mutate(Score = eval(parse(text = paste0(Score)))) %>%
group_by(ID) %>%
mutate(Score = paste0(Score, collapse = "/")) %>%
distinct()
ID Score
<fct> <chr>
1 A 15
2 B 16/18/19/21/6
3 C 3/5
4 D 19/18/14

We can use gsubfn to do this in a compact way
library(gsubfn)
library(tidyverse)
df %>%
mutate(Score = gsubfn("\\d+\\+\\d+", ~ eval(parse(text = x)), Score))
# ID Score
#1 A 15
#2 B 16/18/21/6
#3 C 3/5
#4 D 19/18/14
data
df <- data.frame(ID=c("A","B","C","D"),
Score=c("15","16/18/19+2/6","3/3+2","19/18/14"), stringsAsFactors = FALSE)

library(dplyr)
library(tidyr) #separate_rows, no need for unnest
df %>% rowwise()%>%
mutate(Score_upd=paste0(sapply(unlist(strsplit(gsub('(\\d+)/*\\+(\\d+)','\\1/\\1+\\2',Score),'/')),
function(x)eval(parse(text = x))),collapse = '/')) %>%
separate_rows(Score_upd,sep = '/')
#short version
df %>% mutate(Score=gsub('(\\d+)/*\\+(\\d+)','\\1/\\1+\\2',Score)) %>%
separate_rows(Score,sep='/') %>% rowwise() %>% mutate(Score=eval(parse(text=Score))) %>%
group_by(ID) %>% summarise(Score=paste0(Score,collapse = '/'))
# A tibble: 4 x 2
ID Score
<fct> <chr>
1 A 15
2 B 16/18/19/21/6
3 C 3/5
4 D 19/18/14
The main idea is to use gsub to expand terms such as 2+3 correctly, e.g.:
gsub('(\\d+)/*\\+(\\d+)','\\1/\\1+\\2','20/8/2+3') # /* matches zero or more occurrences of /, e.g. 19+2 and 3/+2.
[1] "20/8/2/2+3"
Then
valid_str <- gsub('(\\d+)/*\\+(\\d+)','\\1/\\1+\\2','20/8/2+3')
sapply(unlist(strsplit(valid_str,'/')),function(x) eval(parse(text=x)))
20 8 2 2+3
20 8 2 5
#OR
sapply(unlist(strsplit(valid_str,'/')),function(x) sum(as.numeric(unlist(strsplit(x,'\\+')))))
20 8 2 2+3
20 8 2 5

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Find duplicate values and have references - r

Related

How to append dataframe column names with for-loop iteration count in R?

Summarize occurrences by area and then by custom groups

creating dataframe from vectors

Filter data by group & preserve empty groups

R Sum numbers within string

Categories

Resources