Remove Redundant row with large number of variable

Remove Redundant row with large number of variable - r

I have data with 33 attribute. 30 of them is variable. And other 3 column is cluster number ,degree and sum of degree. I want to remove duplicate row which have same value from variable 1 until 30. Within duplicate row I want to choose the row which have highest values of sum degree to remain in the data. This coding is run in R. My question is how do I simplify zz.
df_order=dfOrder(rule2,c(33),ascending=FALSE)
df_order2=as_tibble(df_order)
zz=df_order2 %>% distinct(X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,X13,X14,X15,X16,X17,X18,X19,X20,X21,X22,X23,X24,X25,X26,X27,X28,X29,X30,.keep_all = TRUE)

Sample data:
set.seed(42)
dat <- tibble(a=rep(1:2, each=10), b=rep(1:4, each=5), x1=sample(3,size=20,replace=TRUE), x2=sample(3,size=20,replace=TRUE), x3=sample(3,size=20,replace=TRUE))
dat
# # A tibble: 20 x 5
# a b x1 x2 x3
# <int> <int> <int> <int> <int>
# 1 1 1 1 1 3
# 2 1 1 1 3 3
# 3 1 1 1 1 1
# 4 1 1 1 1 1
# 5 1 1 2 2 2
# 6 1 2 2 3 2
# ...truncated...
Brute-force to show what distinct gives you:
distinct(dat, x1, x2, x3, .keep_all = TRUE)
# # A tibble: 14 x 5
# a b x1 x2 x3
# <int> <int> <int> <int> <int>
# 1 1 1 1 1 3
# 2 1 1 1 3 3
# 3 1 1 1 1 1
# 4 1 1 2 2 2
# 5 1 2 2 3 2
# 6 1 2 1 1 2
# 7 1 2 3 2 2
# 8 1 2 3 2 3
# 9 2 3 1 3 2
# 10 2 3 1 3 1
# 11 2 3 2 2 3
# 12 2 4 3 1 2
# 13 2 4 1 2 1
# 14 2 4 3 2 1
Programmatic way, without specifying each of x1 through x3, both work (depending on your preference towards "just use these" or "don't use those"). The first two work in base R and tidyverse equally well, the third is using dplyr::select.
dat[!duplicated(subset(dat, select = -(a:b))),]
dat[!duplicated(subset(dat, select = x1:x3)),]
dat[!duplicated(select(dat, x1:x3)),] # or -(a:b), same
Or perhaps a pipe-looking method:
select(dat, x1:x3) %>%
Negate(duplicated)(.) %>%
which(.) %>%
slice(dat, .)

Using the data from #r2evans post an option is to use splice after converting the column names to symbols
library(dplyr)
dat %>%
distinct(!!! rlang::syms(names(select(., starts_with('x')))), .keep_all = TRUE)
# A tibble: 14 x 5
# a b x1 x2 x3
# <int> <int> <int> <int> <int>
# 1 1 1 1 1 3
# 2 1 1 1 3 3
# 3 1 1 1 1 1
# 4 1 1 2 2 2
# 5 1 2 2 3 2
# 6 1 2 1 1 2
# 7 1 2 3 2 2
# 8 1 2 3 2 3
# 9 2 3 1 3 2
#10 2 3 1 3 1
#11 2 3 2 2 3
#12 2 4 3 1 2
#13 2 4 1 2 1
#14 2 4 3 2 1
From dplyr version >= 1.0.0, we can also use distinct with across
dat %>%
distinct(across(starts_with('x')), .keep_all = TRUE)
# A tibble: 14 x 5
# a b x1 x2 x3
# <int> <int> <int> <int> <int>
# 1 1 1 1 1 3
# 2 1 1 1 3 3
# 3 1 1 1 1 1
# 4 1 1 2 2 2
# 5 1 2 2 3 2
# 6 1 2 1 1 2
# 7 1 2 3 2 2
# 8 1 2 3 2 3
# 9 2 3 1 3 2
#10 2 3 1 3 1
#11 2 3 2 2 3
#12 2 4 3 1 2
#13 2 4 1 2 1
#14 2 4 3 2 1

Related

Use dynamically generated column names in dplyr

I have a data frame with multiple columns, the user provides a vector with the column names, and I want to count maximum amount of times an element appears
set.seed(42)
df <- tibble(
var1 = sample(c(1:3),10,replace=T),
var2 = sample(c(1:3),10,replace=T),
var3 = sample(c(1:3),10,replace=T)
)
select_vars <- c("var1", "var3")
df %>%
rowwise() %>%
mutate(consensus=max(table(unlist(c(var1,var3)))))
# A tibble: 10 x 4
# Rowwise:
var1 var2 var3 consensus
<int> <int> <int> <int>
1 1 1 1 2
2 1 1 3 1
3 1 2 1 2
4 1 2 1 2
5 2 2 2 2
6 2 3 3 1
7 2 3 2 2
8 1 1 1 2
9 3 1 2 1
10 3 3 2 1
This does exactly what I want, but when I try to use a vector of variables i cant get it to work
df %>%
rowwise() %>%
mutate(consensus=max(unlist(table(select_vars)) )))

You can wrap it in c(!!! syms()) to get it working, and you don't need the unlist apparently. But honestly, I'm not sure what you are trying to do, and why table is needed here. Do you just want to check if var2 and var3 are the same value and if then 2 and if not then 1?
library(dplyr)
df <- tibble(
var1 = sample(c(1:3),10,replace=T),
var2 = sample(c(1:3),10,replace=T),
var3 = sample(c(1:3),10,replace=T)
)
select_vars <- c("var2", "var3")
df %>%
rowwise() %>%
mutate(consensus=max(table(c(!!!syms(select_vars)))))
#> # A tibble: 10 x 4
#> # Rowwise:
#> var1 var2 var3 consensus
#> <int> <int> <int> <int>
#> 1 2 3 2 1
#> 2 3 1 3 1
#> 3 3 1 1 2
#> 4 3 3 3 2
#> 5 1 1 2 1
#> 6 2 1 3 1
#> 7 3 2 3 1
#> 8 1 2 3 1
#> 9 2 1 2 1
#> 10 2 1 1 2
Created on 2021-07-22 by the reprex package (v0.3.0)

In the OP's code, we need select
library(dplyr)
df %>%
rowwise() %>%
mutate(consensus=max(table(unlist(select(cur_data(), select_vars))) ))
-output
# A tibble: 10 x 4
# Rowwise:
var1 var2 var3 consensus
<int> <int> <int> <int>
1 1 1 1 2
2 1 1 3 1
3 1 2 1 2
4 1 2 1 2
5 2 2 2 2
6 2 3 3 1
7 2 3 2 2
8 1 1 1 2
9 3 1 2 1
10 3 3 2 1
Or just subset from cur_data() which would only return the data keeping the group attributes
df %>%
rowwise %>%
mutate(consensus = max(table(unlist(cur_data()[select_vars]))))
# A tibble: 10 x 4
# Rowwise:
var1 var2 var3 consensus
<int> <int> <int> <int>
1 1 1 1 2
2 1 1 3 1
3 1 2 1 2
4 1 2 1 2
5 2 2 2 2
6 2 3 3 1
7 2 3 2 2
8 1 1 1 2
9 3 1 2 1
10 3 3 2 1
Or using pmap
library(purrr)
df %>%
mutate(consensus = pmap_dbl(cur_data()[select_vars], ~ max(table(c(...)))))
# A tibble: 10 x 4
var1 var2 var3 consensus
<int> <int> <int> <dbl>
1 1 1 1 2
2 1 1 3 1
3 1 2 1 2
4 1 2 1 2
5 2 2 2 2
6 2 3 3 1
7 2 3 2 2
8 1 1 1 2
9 3 1 2 1
10 3 3 2 1
As these are rowwise operations, can get some efficiency if we use collapse functions
library(collapse)
tfm(df, consensus = dapply(slt(df, select_vars), MARGIN = 1,
FUN = function(x) fmax(tabulate(x))))
# A tibble: 10 x 4
var1 var2 var3 consensus
* <int> <int> <int> <int>
1 1 1 1 2
2 1 1 3 1
3 1 2 1 2
4 1 2 1 2
5 2 2 2 2
6 2 3 3 1
7 2 3 2 2
8 1 1 1 2
9 3 1 2 1
10 3 3 2 1
Benchmarks
As noted above, collapse is faster (run on a slightly bigger dataset)
df1 <- df[rep(seq_len(nrow(df)), 1e5), ]
system.time({
tfm(df1, consensus = dapply(slt(df1, select_vars), MARGIN = 1,
FUN = function(x) fmax(tabulate(x))))
})
#user system elapsed
# 5.257 0.123 5.323
system.time({
df1 %>%
mutate(consensus = pmap_dbl(cur_data()[select_vars], ~ max(table(c(...)))))
})
#user system elapsed
# 54.813 0.517 55.246
The rowwise operation is taking too much time, so stopped the execution
df1 %>%
rowwise() %>%
mutate(consensus=max(table(unlist(select(cur_data(), select_vars))) ))
})
Timing stopped at: 575.5 3.342 581.3

What you need is to use the verb all_of
df %>%
rowwise() %>%
mutate(consensus=max(table(unlist(all_of(select_vars)))))
# A tibble: 10 x 4
# Rowwise:
var1 var2 var3 consensus
<int> <int> <int> <int>
1 2 3 3 1
2 2 2 2 1
3 1 2 2 1
4 2 3 3 1
5 1 2 1 1
6 2 1 2 1
7 2 2 2 1
8 3 1 2 1
9 2 1 3 1
10 3 2 1 1

How to recognize unknown patterns in data frame by row?

I have a data frame where I have agricultural use codes (1-5) for 15 consecutive years. Each row is a polygon representing a field. Ultimately I need R to loop through the rows and recognize patterns of use and tell me their respective frequency. Unfortunately in my real data set I have over 1 mio. features and thus all possible patterns are not known.
a <- data.frame(replicate(15, sample(0:5,500,rep=TRUE)))
colnames(a) <- paste0("use",2005:2019)
id <- c(1:500)
a <- cbind(id,a)
id use2005 use2006 use2007 use2008 use2009 use2010 use2011 use2012 use2013 use2014 use2015 ...
1 1 1 1 1 1 2 2 1 4 4 4 ...
2 4 4 4 4 5 5 5 0 5 5 5 ...
3 1 4 3 2 3 2 4 5 1 1 1 ...
4 1 1 1 1 1 2 2 1 4 4 4 ...
5 4 2 2 2 2 5 3 3 3 3 3 ...
So in this arbitrary example, the code should recognize that id 1 & 4 have the same pattern.
In the end I imagine the result to be some sort of frequency distribution to see if there are certain patterns in the agricultural use of my fields.
For example:
1 1 1 1 1 2 1 1 1 3 2 4 1 1 1
[50] - occurs 50 times
5 5 5 5 5 1 1 1 1 4 4 4 2 2 3
[35] - occurs 35 times
and so forth with all existing combinations...
Unfortunately I have no idea how to approach this. I have no experience with pattern recognition.
Thank you!

maybe this?
library(tidyverse)
a[, -1] %>% group_by_all %>% count
# use2005 use2006 use2007 use2008 use2009 use2010 use2011 use2012 use2013 use2014 use2015 n
# <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1 1 1 1 1 1 2 2 1 4 4 4 2
# 2 1 4 3 2 3 2 4 5 1 1 1 1
# 3 4 2 2 2 2 5 3 3 3 3 3 1
# 4 4 4 4 4 5 5 5 0 5 5 5 1
or if you want to include fields you could change to group_by_at and exclude id from the grouping and then paste fields together:
a %>%
group_by_at(vars(-id)) %>%
summarise(n = n(), ids = paste(id, collapse= "," ))
# use2005 use2006 use2007 use2008 use2009 use2010 use2011 use2012 use2013 use2014 use2015 n ids
# <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <chr>
# 1 1 1 1 1 1 2 2 1 4 4 4 2 1,4
# 2 1 4 3 2 3 2 4 5 1 1 1 1 3
# 3 4 2 2 2 2 5 3 3 3 3 3 1 5
# 4 4 4 4 4 5 5 5 0 5 5 5 1 2

Here's an example on how to approach this, using a small example dataset (i.e. the one you posted).
library(tidyverse)
# example dataset
a = read.table(text = "
id use2005 use2006 use2007 use2008 use2009 use2010 use2011 use2012 use2013 use2014 use2015
1 1 1 1 1 1 2 2 1 4 4 4
2 4 4 4 4 5 5 5 0 5 5 5
3 1 4 3 2 3 2 4 5 1 1 1
4 1 1 1 1 1 2 2 1 4 4 4
5 4 2 2 2 2 5 3 3 3 3 3
", header=T)
a %>%
group_nest(id) %>% # for each row
mutate(pattern = map(data, ~paste(.x, collapse = ","))) %>% # create the pattern as a string
unnest(pattern) %>% # unnest pattern column
count(pattern, sort = T) # count patterns
# # A tibble: 4 x 2
# pattern n
# <chr> <int>
# 1 1,1,1,1,1,2,2,1,4,4,4 2
# 2 1,4,3,2,3,2,4,5,1,1,1 1
# 3 4,2,2,2,2,5,3,3,3,3,3 1
# 4 4,4,4,4,5,5,5,0,5,5,5 1

is there a way in R to fill missing groups absent of observations?

Say I have something like:
df<-data.frame(group=c(1, 1,1, 2,2,2,3,3,3,4,4, 1, 1,1),
group2=c(1,2,3,1,2,3,1,2,3,1,3, 1,2,3))
group group2
1 1 1
2 1 2
3 1 3
4 2 1
5 2 2
6 2 3
7 3 1
8 3 2
9 3 3
10 4 1
11 4 3
12 1 1
13 1 2
14 1 3
My goal is to count the number of unique instances for group= something and group2= something. Like so:
df1<-df%>%group_by(group, group2)%>% mutate(want=n())%>%distinct(group, group2, .keep_all=TRUE)
group group2 want
<dbl> <dbl> <int>
1 1 1 2
2 1 2 2
3 1 3 2
4 2 1 1
5 2 2 1
6 2 3 1
7 3 1 1
8 3 2 1
9 3 3 1
10 4 1 1
11 4 3 1
however, notice that group=4, group2=2 was not in my dataset to begin with. Is there some sort of autofill function where I can fill these non-observations with a zero to get below easily?:
group group2 want
<dbl> <dbl> <int>
1 1 1 2
2 1 2 2
3 1 3 2
4 2 1 1
5 2 2 1
6 2 3 1
7 3 1 1
8 3 2 1
9 3 3 1
10 4 1 1
11 4 2 0
12 4 3 1

After getting the count, we can expand with complete to fill the missing combinations with 0
library(dplyr)
library(tidyr)
df %>%
count(group, group2) %>%
complete(group, group2, fill = list(n = 0))
# A tibble: 12 x 3
# group group2 n
# <dbl> <dbl> <dbl>
# 1 1 1 2
# 2 1 2 2
# 3 1 3 2
# 4 2 1 1
# 5 2 2 1
# 6 2 3 1
# 7 3 1 1
# 8 3 2 1
# 9 3 3 1
#10 4 1 1
#11 4 2 0
#12 4 3 1
Or if we do the group_by, instead of mutate and then do the distinct, directly use the summarise
df %>%
group_by(group, group2) %>%
summarise(n = n()) %>%
ungroup %>%
complete(group, group2, fill = list(n = 0))

Here is a data.table approach solution to this problem:
library(data.table)
setDT(df)[CJ(group, group2, unique = TRUE),
c(.SD, .(want = .N)), .EACHI,
on = c("group", "group2")]
# group group2 want
# 1 1 2
# 1 2 2
# 1 3 2
# 2 1 1
# 2 2 1
# 2 3 1
# 3 1 1
# 3 2 1
# 3 3 1
# 4 1 1
# 4 2 0
# 4 3 1

How to arrange/sort by unique sequences?

A) Here is my data frame arranged by plate:
df <- read.table(header=TRUE, stringsAsFactors=FALSE, text="
plate phase score
A 1 1
A 1 1
A 1 1
A 2 1
A 2 1
A 2 1
A 3 2
A 3 2
A 3 2
B 1 1
B 1 1
B 1 2
B 2 1
B 2 1
B 2 3")
B) Goal: I want to order it by both plate first and then phase but sequentially (see below how the rows are ordered alphabetically by plate but sequentially by phase
plate phase score
<chr> <int> <int>
1 A 1 1
2 A 2 1
3 A 3 2
4 A 1 1
5 A 2 1
6 A 3 2
7 A 1 1
8 A 2 1
9 A 3 2
10 B 1 1
11 B 2 1
12 B 1 1
13 B 2 1
14 B 1 2
15 B 2 3

One option is to create a sequence variable grouped by 'plate', 'phase' and arrange on it along with 'plate' and 'score'
library(dplyr)
df %>%
group_by(plate, phase) %>%
mutate(rn = row_number()) %>%
ungroup %>%
arrange(plate, rn, score) %>%
select(-rn)
# A tibble: 15 x 3
# plate phase score
# <chr> <int> <int>
# 1 A 1 1
# 2 A 2 1
# 3 A 3 2
# 4 A 1 1
# 5 A 2 1
# 6 A 3 2
# 7 A 1 1
# 8 A 2 1
# 9 A 3 2
#10 B 1 1
#11 B 2 1
#12 B 1 1
#13 B 2 1
#14 B 1 2
#15 B 2 3
Or using data.table
library(data.table)
setDT(df)[order(plate, rowid(phase), score)]

df[with(df, order(plate, ave(phase, phase, FUN = seq_along), phase)),]
#> plate phase score
#> 1 A 1 1
#> 4 A 2 1
#> 7 A 3 2
#> 2 A 1 1
#> 5 A 2 1
#> 8 A 3 2
#> 3 A 1 1
#> 6 A 2 1
#> 9 A 3 2
#> 10 B 1 1
#> 13 B 2 1
#> 11 B 1 1
#> 14 B 2 1
#> 12 B 1 2
#> 15 B 2 3

repeat list in to a data frame in R

I have a list let's say
k<-c(1,2,3,4)
I want to create a dataframe with let's say 5 rows using the same list in each row as shown below.
X1 X2 X3 X4
1 1 2 3 4
2 1 2 3 4
3 1 2 3 4
4 1 2 3 4
5 1 2 3 4
I tried doing:-
> rep(k, each = 5)
[1] 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3 4 4 4 4 4
However I am not able to get intended result. Any suggestions?

data.frame(t(replicate(5, k)))
#OR
data.frame(matrix(rep(k, each = 5), 5))
#OR
data.frame(t(sapply(1:5, function(x) k)))
# X1 X2 X3 X4
#1 1 2 3 4
#2 1 2 3 4
#3 1 2 3 4
#4 1 2 3 4
#5 1 2 3 4

Here is one option by converting the vector to list with as.list, change it to data.frame (as.data.frame and replicate the rows
as.data.frame(as.list(k))[rep(1, 5),]
# X1 X2 X3 X4
#1 1 2 3 4
#1.1 1 2 3 4
#1.2 1 2 3 4
#1.3 1 2 3 4
#1.4 1 2 3 4
Or another option is to take the transpose of the vector to get a row matrix, replicate the rows and convert to data.frame
as.data.frame(t(k)[rep(1, 5),])
In tidyverse, one option is to convert to tibble and then uncount
library(dplyr)
library(tidyr)
library(stringr)
as.list(k) %>%
set_names(str_c("X", seq_along(k))) %>%
as_tibble %>%
uncount(5)
# A tibble: 5 x 4
# X1 X2 X3 X4
# <dbl> <dbl> <dbl> <dbl>
#1 1 2 3 4
#2 1 2 3 4
#3 1 2 3 4
#4 1 2 3 4
#5 1 2 3 4

purrr::map_dfc(k, rep, 5)
# # A tibble: 5 x 4
# V1 V2 V3 V4
# <dbl> <dbl> <dbl> <dbl>
# 1 1 2 3 4
# 2 1 2 3 4
# 3 1 2 3 4
# 4 1 2 3 4
# 5 1 2 3 4

Using data.table:
k = c(1,2,3,4)
n = 5 # Number of rows
df = data.table()
df = df[, lapply(1:length(k), function(x) rep(k[x], n))]
> df
V1 V2 V3 V4
1: 1 2 3 4
2: 1 2 3 4
3: 1 2 3 4
4: 1 2 3 4
5: 1 2 3 4

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Remove Redundant row with large number of variable - r

Related

Use dynamically generated column names in dplyr

How to recognize unknown patterns in data frame by row?

is there a way in R to fill missing groups absent of observations?

How to arrange/sort by unique sequences?

repeat list in to a data frame in R

Categories

Resources