Performing row-wise normalization of data in R

A simple operation I would like to do, which is proving not to be so simple: I have a time series data set, and I would like to perform row-wise normalization, i.e. for each observation, (x - mean(row)) / sd(row).
This was one attempt, but to no avail. I've also replaced NA values with 0, so that doesn't seem to be the issue.
norm <- for (i in 1:nrow(clusterdatairaq2)) {
  for (j in 2:ncol(clusterdatairaq2)) {
    clusterdatairaq2[i, j] <- (clusterdatairaq2[i, j] - mean(clusterdatairaq2[i, ])) / sd(clusterdatairaq2[i, ])
  }
}
Thanks in advance for any help!!
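For what it's worth, the loop as posted most likely fails because clusterdatairaq2[i, ] is a one-row data frame, not a numeric vector, so mean() returns NA with a warning. A minimal sketch of a corrected loop (assuming, as in the original, that column 1 is a non-numeric identifier):
for (i in 1:nrow(clusterdatairaq2)) {
  # unlist() turns the one-row data frame into a plain numeric vector
  row_vals <- unlist(clusterdatairaq2[i, -1])
  clusterdatairaq2[i, -1] <- (row_vals - mean(row_vals)) / sd(row_vals)
}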

Assuming we have a data frame like this:
library(dplyr)
df = tibble(
  Destination = c("Belgium", "Bulgaria", "Czechia"),
  `Jan 2008` = sample(1:1000, size=3),
  `Feb 2008` = sample(1:1000, size=3),
  `Mar 2008` = sample(1:1000, size=3)
)
df
# A tibble: 3 × 4
Destination `Jan 2008` `Feb 2008` `Mar 2008`
<chr> <int> <int> <int>
1 Belgium 811 299 31
2 Bulgaria 454 922 421
3 Czechia 638 709 940
The tidyverse way to do this (which I think is better than base R here):
library(dplyr)
library(tidyr)
scaled = df %>%
  pivot_longer(`Jan 2008`:`Mar 2008`) %>%
  group_by(Destination) %>%
  mutate(value = as.numeric(scale(value))) %>%
  ungroup()
scaled
# A tibble: 9 × 3
Destination name value
<chr> <chr> <dbl>
1 Belgium Jan 2008 1.09
2 Belgium Feb 2008 -0.205
3 Belgium Mar 2008 -0.881
4 Bulgaria Jan 2008 -0.517
5 Bulgaria Feb 2008 1.15
6 Bulgaria Mar 2008 -0.635
7 Czechia Jan 2008 -0.787
8 Czechia Feb 2008 -0.338
9 Czechia Mar 2008 1.13
Now, you could pivot it back to the original form, but there's not much point, because analysis will be much easier in long form:
scaled %>% pivot_wider(names_from=name, values_from=value)
# A tibble: 3 × 4
Destination `Jan 2008` `Feb 2008` `Mar 2008`
<chr> <dbl> <dbl> <dbl>
1 Belgium 1.09 -0.205 -0.881
2 Bulgaria -0.517 1.15 -0.635
3 Czechia -0.787 -0.338 1.13

set.seed(42)
mtx <- matrix(sample(99, size=6*5, replace=TRUE), nrow=6)
df <- cbind(data.frame(id = letters[1:6]), mtx)
df
# id A B C D E
# 1 a 49 47 26 95 58
# 2 b 65 24 3 5 97
# 3 c 25 71 41 84 42
# 4 d 74 89 89 34 24
# 5 e 18 37 27 92 30
# 6 f 49 20 36 3 43
out <- t(apply(df[,-1], 1, function(X) (X-mean(X)) / sd(X)))
colnames(out) <- paste0(colnames(df[,-1]), "_norm")
df <- cbind(df, out)
df
# id A B C D E A_norm B_norm C_norm D_norm E_norm
# 1 a 49 47 26 95 58 -0.2376354 -0.3168472 -1.1485711 1.5842361 0.1188177
# 2 b 65 24 3 5 97 0.6393668 -0.3611690 -0.8736386 -0.8248320 1.4202728
# 3 c 25 71 41 84 42 -1.1427812 0.7618541 -0.4802994 1.3001207 -0.4388942
# 4 d 74 89 89 34 24 0.3878036 0.8725581 0.8725581 -0.9048751 -1.2280448
# 5 e 18 37 27 92 30 -0.7749098 -0.1291516 -0.4690243 1.7401483 -0.3670625
# 6 f 49 20 36 3 43 1.0067737 -0.5462283 0.3106004 -1.4566088 0.6854630
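A handy equivalent, since scale() standardizes columns: transpose, scale, and transpose back (a sketch against the same mtx as above):
# scale() works column-wise, so double transposition gives row-wise z-scores
out2 <- t(scale(t(mtx)))
all.equal(out2, out, check.attributes = FALSE)
# [1] TRUE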

I used the mtcars dataset as an example:
library(tidyverse)
mtcars %>% # the dataset
  select(disp) %>% # disp is the column we want to normalize, just as an example
  mutate(disp2 = (disp - mean(disp)) / sd(disp)) # disp2 is the normalized column
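Note that this scales disp down its column rather than across a row. For a genuinely row-wise version, a sketch with rowwise() and c_across() (available in dplyr >= 1.0):
library(dplyr)
mtcars %>%
  rowwise() %>%
  # z-score of disp relative to the other values in the same row
  mutate(disp2 = (disp - mean(c_across(mpg:carb))) / sd(c_across(mpg:carb))) %>%
  ungroup()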

A dplyr solution, re-using @Migwell's toy example (please provide a reproducible example in your question):
library(dplyr)
library(data.table) # for data.table()
df = data.table(
  Destination = c("Belgium", "Bulgaria", "Czechia"),
  `Jan 2008` = sample(1:1000, size=3),
  `Feb 2008` = sample(1:1000, size=3),
  `Mar 2008` = sample(1:1000, size=3))
> df
Destination Jan 2008 Feb 2008 Mar 2008
1: Belgium 443 114 628
2: Bulgaria 755 801 493
3: Czechia 123 512 517
You can use:
df2 <- df %>%
  select(`Jan 2008`:`Mar 2008`) %>%
  mutate(normJan2008 = (`Jan 2008` - rowMeans(., na.rm = T)) / apply(., 1, sd))
> df2
Jan 2008 Feb 2008 Mar 2008 normJan2008
1: 443 114 628 0.1843742
2: 755 801 493 0.4333577
3: 123 512 517 -1.1546299
And do this for every variable you need to normalize.
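If there are many columns, a sketch that normalizes all of them in one mutate() call (assumes dplyr >= 1.1 for pick()):
df %>%
  mutate(across(
    `Jan 2008`:`Mar 2008`,
    # row means/sds are taken over the original month columns
    ~ (.x - rowMeans(pick(`Jan 2008`:`Mar 2008`), na.rm = TRUE)) /
      apply(pick(`Jan 2008`:`Mar 2008`), 1, sd, na.rm = TRUE),
    .names = "norm{.col}"
  ))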

Related

Calculate Percentage by Group with multiple columns in R

I have several data frames with monthly data, and I would like to find the percentage distribution for each product and each month. The problem is the multiple month columns: currently, I can only get a percentage by group for one month at a time.
data <- data.frame(group = rep(LETTERS[1:3], each = 4),
                   Product = letters[1:4],
                   January = sample(1:100,12),
                   February = sample(1:100,12))
data_new1 <- transform(data,
                       perc = ave(January,
                                  group,
                                  FUN = prop.table))
data_new1$perc <- round(data_new1$perc, 2)
> data_new1
group Product January February perc
1 A a 12 16 0.05
2 A b 73 75 0.32
3 A c 78 11 0.34
4 A d 65 35 0.29
5 B a 86 63 0.36
6 B b 33 71 0.14
7 B c 92 49 0.38
8 B d 30 60 0.12
9 C a 91 59 0.37
10 C b 31 45 0.12
11 C c 99 7 0.40
12 C d 28 50 0.11
tidyverse
library(dplyr)
data %>%
  group_by(group) %>%
  mutate(across(c("January", "February"), proportions, .names = "{.col}_perc")) %>%
  ungroup()
# A tibble: 12 x 6
group Product January February January_perc February_perc
<chr> <chr> <int> <int> <dbl> <dbl>
1 A a 49 40 0.426 0.252
2 A b 1 3 0.00870 0.0189
3 A c 19 50 0.165 0.314
4 A d 46 66 0.4 0.415
5 B a 61 82 0.218 0.285
6 B b 88 51 0.314 0.177
7 B c 32 75 0.114 0.260
8 B d 99 80 0.354 0.278
9 C a 6 31 0.0397 0.373
10 C b 8 5 0.0530 0.0602
11 C c 92 20 0.609 0.241
12 C d 45 27 0.298 0.325
base
data <- data.frame(group = rep(LETTERS[1:3], each = 4),
                   Product = letters[1:4],
                   January = sample(1:100,12),
                   February = sample(1:100,12))
tmp <- sapply(c("January", "February"), function (x) ave(data[[x]], data$group, FUN = prop.table))
colnames(tmp) <- paste0(colnames(tmp), "_perc")
res <- cbind(data, tmp)
res
#> group Product January February January_perc February_perc
#> 1 A a 42 73 0.18260870 0.238562092
#> 2 A b 67 92 0.29130435 0.300653595
#> 3 A c 58 90 0.25217391 0.294117647
#> 4 A d 63 51 0.27391304 0.166666667
#> 5 B a 48 15 0.21621622 0.081521739
#> 6 B b 16 82 0.07207207 0.445652174
#> 7 B c 80 75 0.36036036 0.407608696
#> 8 B d 78 12 0.35135135 0.065217391
#> 9 C a 81 16 0.32793522 0.117647059
#> 10 C b 83 81 0.33603239 0.595588235
#> 11 C c 11 1 0.04453441 0.007352941
#> 12 C d 72 38 0.29149798 0.279411765
Created on 2021-12-20 by the reprex package (v2.0.1)
data.table
library(data.table)
COLS <- c("January", "February")
COLS_RES <- paste0(COLS, "_perc")
setDT(data)[, (COLS_RES) := lapply(.SD, proportions), by = group, .SDcols = COLS][]
These calculations are easier if your data is structured in a tidy way. In your case, January and February should probably be one single variable called month or something.
Example:
Underneath, I use tidyr::pivot_longer() to combine January and February into one column. Then I use the dplyr package to group the data frame and calculate perc. I'm not using prop.table(), but I believe you just want each observation's proportion of the total for that group and month.
library(dplyr)
library(tidyr)
# To make the sampling underneath reproducible
set.seed(1)
data <- data.frame(
  group = rep(LETTERS[1:3], each = 4),
  Product = letters[1:4],
  January = sample(1:100,12),
  February = sample(1:100,12)
)
data %>%
  pivot_longer(c(January, February), names_to = "month", values_to = "x") %>%
  group_by(group, month) %>%
  mutate(
    perc = round(x/sum(x), 2)
  )
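If you do need the wide shape back afterwards, pivot_wider() can spread both the values and the percentages out again; a sketch continuing the pipeline above:
data %>%
  pivot_longer(c(January, February), names_to = "month", values_to = "x") %>%
  group_by(group, month) %>%
  mutate(perc = round(x/sum(x), 2)) %>%
  ungroup() %>%
  # gives x_January, x_February, perc_January, perc_February
  pivot_wider(names_from = month, values_from = c(x, perc))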
I hope this is what you were looking for.
Another dplyr solution:
library(dplyr)
data %>%
  group_by(group) %>%
  mutate(across(c(2:5),
                ~ ./sum(.)*100, .names = "{.col}_pct"))
# A tibble: 12 × 10
# Groups: group [3]
group Product Jan Feb Mar May Jan_pct Feb_pct Mar_pct May_pct
<chr> <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
1 A a 14 14 95 50 8 18.4 44.4 20.9
2 A b 100 33 28 32 57.1 43.4 13.1 13.4
3 A c 11 16 13 95 6.29 21.1 6.07 39.7
4 A d 50 13 78 62 28.6 17.1 36.4 25.9
5 B a 29 42 72 13 22.0 33.9 20.3 7.07
6 B b 3 4 88 41 2.27 3.23 24.9 22.3
7 B c 30 68 94 86 22.7 54.8 26.6 46.7
8 B d 70 10 100 44 53.0 8.06 28.2 23.9
9 C a 4 88 45 84 3.96 43.6 24.2 30.7
10 C b 52 12 26 55 51.5 5.94 14.0 20.1
11 C c 26 20 23 57 25.7 9.90 12.4 20.8
12 C d 19 82 92 78 18.8 40.6 49.5 28.5
Data:
data <- data.frame(group = rep(LETTERS[1:3], each = 4),
                   Product = letters[1:4],
                   Jan = sample(1:100,12),
                   Feb = sample(1:100,12),
                   Mar = sample(1:100, 12),
                   May = sample(1:100, 12))

Subtract columns when column name is a year

I'm certain the answer to this is simple, but I can't figure it out.
I have a pivot table where the column headings are years.
# A tibble: 3 x 5
country 2012 2013 2014 2015
<chr> <dbl> <dbl> <dbl> <dbl>
USA 45 23 12 42
Canada 67 98 14 25
Mexico 89 104 78 3
I want to create a new column that calculates the difference between two other columns. Rather than recognizing the year as a column heading, however, R takes the difference between the two years themselves.
Below is a sample of my code. If I put " " around the years, I get an error: "x non-numeric argument to binary operator". Without " ", R creates a new column with the value -3, simply subtracting the years.
df %>%
  pivot_wider(names_from = year, values_from = value) %>%
  mutate(diff = 2012 - 2015)
How do I re-write this to get the following table:
# A tibble: 3 x 6
country 2012 2013 2014 2015 diff
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
USA 45 23 12 47 -2
Canada 67 98 14 25 42
Mexico 89 104 78 3 86
You may try
df %>%
  pivot_wider(names_from = year, values_from = value) %>%
  mutate(diff = .$'2012' - .$'2015')
with your data,
df <- read.table(text = "country 2012 2013 2014 2015
USA 45 23 12 42
Canada 67 98 14 25
Mexico 89 104 78 3
", header = T)
names(df) <- c("country", 2012, 2013, 2014, 2015)
df %>%
  mutate(diff = .$'2012' - .$'2015')
country 2012 2013 2014 2015 diff
1 USA 45 23 12 42 3
2 Canada 67 98 14 25 42
3 Mexico 89 104 78 3 86
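As an aside, backticks are the more idiomatic way to reference non-syntactic column names such as 2012 inside mutate(); a sketch of the same calculation:
df %>%
  pivot_wider(names_from = year, values_from = value) %>%
  # backticks let you refer to columns whose names start with a digit
  mutate(diff = `2012` - `2015`)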

Calculate area under the curve for time series data

I want to calculate the area under the curve for the time points for each id and column. Any suggestions? Which R packages to use? Many thanks!
id <- rep(1:3,each=5)
time <- rep(c(10,20,30,40,50),3)
q1 <- sample(100,15, replace=T)
q2 <- sample(100,15, replace=T)
q3 <- sample(100,15, replace=T)
df <- data.frame(id,time,q1,q2,q3)
df
id time q1 q2 q3
1 10 38 55 38
1 20 46 29 88
1 30 16 28 97
1 40 37 20 81
1 50 59 27 42
2 10 82 81 54
2 20 45 3 23
2 30 82 67 59
2 40 27 3 42
2 50 45 71 45
3 10 39 8 29
3 20 12 6 90
3 30 92 11 7
3 40 52 8 37
3 50 81 57 80
Wanted output, something like this:
q1 q2 q3
1 area area area
2 area area area
3 area area area
library(tidyverse)
id <- rep(1:3,each=5)
time <- rep(c(10,20,30,40,50),3)
q1 <- sample(100,15, replace=T)
q2 <- sample(100,15, replace=T)
q3 <- sample(100,15, replace=T)
df <- data.frame(id,time,q1,q2,q3)
df %>%
  arrange(time) %>%
  pivot_longer(cols = c(q1, q2, q3)) -> longer_df
longer_df %>%
  ggplot(aes(x = time, y = value, col = factor(id))) +
  geom_line() +
  geom_point() +
  facet_wrap(. ~ name)
longer_df %>%
  group_by(id, name) %>%
  mutate(lag_value = lag(value),
         midpoint_value = (value + lag_value)/2) %>%
  summarize(area = 10*sum(midpoint_value, na.rm = T)) %>%
  pivot_wider(values_from = area)
#> `summarise()` has grouped output by 'id'. You can override using the `.groups` argument.
#> # A tibble: 3 x 4
#> # Groups: id [3]
#> id q1 q2 q3
#> <int> <dbl> <dbl> <dbl>
#> 1 1 1960 1980 2075
#> 2 2 1025 2215 2180
#> 3 3 2105 1590 2110
Created on 2021-06-30 by the reprex package (v2.0.0)
Here I will use the trapz function to calculate the integral.
library(data.table)
library(caTools) # integrate with its trapz function
# data
df <- fread("id time q1 q2 q3
1 10 38 55 38
1 20 46 29 88
1 30 16 28 97
1 40 37 20 81
1 50 59 27 42
2 10 82 81 54
2 20 45 3 23
2 30 82 67 59
2 40 27 3 42
2 50 45 71 45
3 10 39 8 29
3 20 12 6 90
3 30 92 11 7
3 40 52 8 37
3 50 81 57 80")
# calculate the area with `trapz`
df[,lapply(.SD[,2:4], function(y) trapz(time,y)),by=id]
#> id q1 q2 q3
#> 1: 1 1475 1180 3060
#> 2: 2 2175 1490 1735
#> 3: 3 2160 575 1885
Created on 2021-06-30 by the reprex package (v2.0.0)
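If you would rather not add a dependency, the trapezoidal rule is a one-liner in base R; a sketch (assumes time is sorted within each id, as in the data above):
# sum of trapezoid areas between consecutive time points
trapz_manual <- function(x, y) sum(diff(x) * (head(y, -1) + tail(y, -1)) / 2)
df[, lapply(.SD, trapz_manual, x = time), by = id, .SDcols = c("q1", "q2", "q3")]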

Simplify multiple rowSums looping through columns

I'm currently in R, trying to create for a data frame multiple columns, each holding the cumulative sum of the previous ones. Imagine I have a DF like this:
df=
sep-2016 oct-2016 nov-2016 dec-2016 jan-2017
1 70 153 NA 28 19
2 57 68 73 118 16
3 29 NA 19 32 36
4 177 36 3 54 53
and I want to add, at the end, columns with the cumulative sum up to each reported month: for October you end up with the sum of Sep and Oct, for November the sum of Sep, Oct and Nov, and so on, ending up with something like this:
df=
sep-2016 oct-2016 nov-2016 dec-2016 jan-2017 status-Oct2016 status-Nov 2016
1 70 153 NA 28 19 223 223
2 57 68 73 118 16 105 198
3 29 NA 19 32 36 29 48
4 177 36 3 54 53 213 93
I want to know an efficient way to do this instead of writing lots of lines of rowSums(), and it would be amazing if I could also get the label for each month in the iteration!
Thanks!
We can use lapply to loop through the columns to apply the rowSums.
dat2 <- as.data.frame(lapply(2:ncol(dat), function(i){
  rowSums(dat[, 1:i], na.rm = TRUE)
}))
names(dat2) <- paste0("status-", names(dat[, -1]))
dat3 <- cbind(dat, dat2)
dat3
# sep-2016 oct-2016 nov-2016 dec-2016 jan-2017 status-oct-2016 status-nov-2016 status-dec-2016 status-jan-2017
# 1 70 153 NA 28 19 223 223 251 270
# 2 57 68 73 118 16 125 198 316 332
# 3 29 NA 19 32 36 29 48 80 116
# 4 177 36 3 54 53 213 216 270 323
DATA
dat <- read.table(text = " 'sep-2016' 'oct-2016' 'nov-2016' 'dec-2016' 'jan-2017'
1 70 153 NA 28 19
2 57 68 73 118 16
3 29 NA 19 32 36
4 177 36 3 54 53",
header = TRUE, stringsAsFactors = FALSE)
names(dat) <- c("sep-2016", "oct-2016", "nov-2016", "dec-2016", "jan-2017")
Honestly I have no idea why you would want your data in this format, but here is a tidyverse method of accomplishing it. It involves transforming the data to a tidy format before spreading it back out into your wide format. The key thing to note is that in a tidy format, where month is a variable in a single column instead of spread across multiple columns, you can simply use group_by(rowid) and cumsum to calculate all the values you want. The last few lines are constructing the status- column names and spreading the data back out into a wide format.
library(tidyverse)
df <- read_table2(
"sep-2016 oct-2016 nov-2016 dec-2016 jan-2017
70 153 NA 28 19
57 68 73 118 16
29 NA 19 32 36
177 36 3 54 53"
)
df %>%
  rowid_to_column() %>%
  gather("month", "value", -rowid) %>%
  arrange(rowid) %>%
  group_by(rowid) %>%
  mutate(
    value = replace_na(value, 0),
    status = cumsum(value)
  ) %>%
  gather("vartype", "number", value, status) %>%
  mutate(colname = ifelse(vartype == "value", month, str_c("status-", month))) %>%
  select(rowid, number, colname) %>%
  spread(colname, number)
#> # A tibble: 4 x 11
#> # Groups: rowid [4]
#> rowid `dec-2016` `jan-2017` `nov-2016` `oct-2016` `sep-2016`
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 28.0 19.0 0 153 70.0
#> 2 2 118 16.0 73.0 68.0 57.0
#> 3 3 32.0 36.0 19.0 0 29.0
#> 4 4 54.0 53.0 3.00 36.0 177
#> # ... with 5 more variables: `status-dec-2016` <dbl>,
#> # `status-jan-2017` <dbl>, `status-nov-2016` <dbl>,
#> # `status-oct-2016` <dbl>, `status-sep-2016` <dbl>
Created on 2018-02-16 by the reprex package (v0.2.0).
A clean way to do it is to convert your data to a long format.
library(tibble)
library(tidyr)
library(dplyr)
your_data <- tribble(~"sep_2016", ~"oct_2016", ~"nov_2016", ~"dec_2016", ~"jan_2017",
                     70, 153, NA, 28, 19,
                     57, 68, 73, 118, 16,
                     29, NA, 19, 32, 36,
                     177, 36, 3, 54, 53)
You can change the format of your data.frame with gather from the tidyr package.
your_data_long <- your_data %>%
  rowid_to_column() %>%
  gather(key = month_year, value = the_value, -rowid)
head(your_data_long)
#> # A tibble: 6 x 3
#> rowid month_year the_value
#> <int> <chr> <dbl>
#> 1 1 sep_2016 70
#> 2 2 sep_2016 57
#> 3 3 sep_2016 29
#> 4 4 sep_2016 177
#> 5 1 oct_2016 153
#> 6 2 oct_2016 68
Once your data.frame is in a long format, you can compute the cumulative sum with cumsum and the dplyr functions mutate and group_by.
result <- your_data_long %>%
  group_by(rowid) %>%
  mutate(cumulative_value = cumsum(the_value))
result
#> # A tibble: 20 x 4
#> # Groups: rowid [4]
#> rowid month_year the_value cumulative_value
#> <int> <chr> <dbl> <dbl>
#> 1 1 sep_2016 70 70
#> 2 2 sep_2016 57 57
#> 3 3 sep_2016 29 29
#> 4 4 sep_2016 177 177
#> 5 1 oct_2016 153 223
#> 6 2 oct_2016 68 125
#> 7 3 oct_2016 NA NA
#> 8 4 oct_2016 36 213
#> 9 1 nov_2016 NA NA
#> 10 2 nov_2016 73 198
#> 11 3 nov_2016 19 NA
#> 12 4 nov_2016 3 216
#> 13 1 dec_2016 28 NA
#> 14 2 dec_2016 118 316
#> 15 3 dec_2016 32 NA
#> 16 4 dec_2016 54 270
#> 17 1 jan_2017 19 NA
#> 18 2 jan_2017 16 332
#> 19 3 jan_2017 36 NA
#> 20 4 jan_2017 53 323
If you want to retrieve the starting form, you can do it with spread.
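A sketch of that last step, sticking to the same gather/spread vocabulary:
result %>%
  select(-the_value) %>%
  # one column per month_year, holding the cumulative sums
  spread(month_year, cumulative_value)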
My preferred solution would be:
# library(matrixStats)
DF <- as.matrix(df)
DF[is.na(DF)] <- 0
RES <- matrixStats::rowCumsums(DF)
colnames(RES) <- paste0("status-", colnames(DF))
cbind.data.frame(df, RES)
This is closest to what you are looking for with the rowSums.
One option could be using the spread and gather functions from tidyverse.
Note: the status column has been added even for the first month, and the status columns are not in order, but the values are correct.
The approach is:
# Data
df <- read.table(text = "sep-2016 oct-2016 nov-2016 dec-2016 jan-2017
70 153 NA 28 19
57 68 73 118 16
29 NA 19 32 36
177 36 3 54 53", header = T, stringsAsFactors = F)
library(tidyverse)
# Just add a row number as sl
df <- df %>% mutate(sl = row_number())
# Calculate the cumulative sum after gathering and arranging by date
mod_df <- df %>%
  gather(key, value, -sl) %>%
  mutate(key = as.Date(paste("01", key, sep="."), format="%d.%b.%Y")) %>%
  arrange(sl, key) %>%
  group_by(sl) %>%
  mutate(status = cumsum(ifelse(is.na(value), 0L, value))) %>%
  select(-value) %>%
  mutate(key = paste("status", as.character(key, format="%b.%Y"))) %>%
  spread(key, status)
# Finally, join the cumulative sum columns back onto the original df and
# remove the sl column
inner_join(df, mod_df, by = "sl") %>% select(-sl)
# sep.2016 oct.2016 nov.2016 dec.2016 jan.2017 status Dec.2016 status Jan.2017 status Nov.2016 status Oct.2016 status Sep.2016
#1 70 153 NA 28 19 251 270 223 223 70
#2 57 68 73 118 16 316 332 198 125 57
#3 29 NA 19 32 36 80 116 48 29 29
#4 177 36 3 54 53 270 323 216 213 177
Another base solution, where we build a matrix accumulating the row sums:
status <- setNames(
  as.data.frame(t(apply(dat, 1, function(x) Reduce(sum, '[<-'(x, is.na(x), 0), accumulate = TRUE)))),
  paste0("status-", names(dat)))
status
# status-sep-2016 status-oct-2016 status-nov-2016 status-dec-2016 status-jan-2017
# 1 70 223 223 251 270
# 2 57 125 198 316 332
# 3 29 29 48 80 116
# 4 177 213 216 270 323
Then bind it to your original data if needed:
cbind(dat,status[-1])
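If the '[<-'(x, is.na(x), 0) idiom is hard to read, here is the same accumulation written out explicitly (a sketch equivalent to the above):
status2 <- t(apply(dat, 1, function(x) {
  x[is.na(x)] <- 0   # zero out NAs so the running total keeps accumulating
  cumsum(x)          # cumulative sum across the row
}))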

Display duplicate records in data.frame and omit single ones

I have been struggling with how to select ONLY the duplicated rows of a data.frame in R.
For instance, my data.frame is:
age=18:29
height=c(76.1,77,78.1,78.2,78.8,79.7,79.9,81.1,81.2,81.8,82.8,83.5)
Names=c("John","John","John", "Harry", "Paul", "Paul", "Paul", "Khan", "Khan", "Khan", "Sam", "Joe")
village <- data.frame(Names, age, height)
Names age height
John 18 76.1
John 19 77.0
John 20 78.1
Harry 21 78.2
Paul 22 78.8
Paul 23 79.7
Paul 24 79.9
Khan 25 81.1
Khan 26 81.2
Khan 27 81.8
Sam 28 82.8
Joe 29 83.5
I want to see the result as follows:
Names age height
John 18 76.1
John 19 77.0
John 20 78.1
Paul 22 78.8
Paul 23 79.7
Paul 24 79.9
Khan 25 81.1
Khan 26 81.2
Khan 27 81.8
Thanks for your time...
A solution using duplicated twice:
village[duplicated(village$Names) | duplicated(village$Names, fromLast = TRUE), ]
Names age height
1 John 18 76.1
2 John 19 77.0
3 John 20 78.1
5 Paul 22 78.8
6 Paul 23 79.7
7 Paul 24 79.9
8 Khan 25 81.1
9 Khan 26 81.2
10 Khan 27 81.8
An alternative solution with by:
village[unlist(by(seq(nrow(village)), village$Names,
                  # return the row indices only for names that appear more than once
                  function(x) if (length(x) - 1) x)), ]
I find @Sven's answer using duplicated the "tidiest", but you can also do this many other ways. Here are two more:
Use table() and subset by matching the names where the tabulation is > 1 with the names present in the first column:
village[village$Names %in% names(which(table(village$Names) > 1)), ]
Use ave() to "tabulate" in a slightly different manner, but subset in the same way:
village[with(village, ave(as.numeric(Names), Names, FUN = length) > 1), ]
Alternatively, you can use grouping and summarising in a dplyr pipeline.
It's more lines of code and maybe more costly to compute, but the advantage is that you can find duplicate rows by a composite key of multiple columns, rather than only duplicates within one column.
library(tidyverse)
a <- c(8, 18, 19, 19, 19, 20, 30, 32, 32)
b <- c(1950, 1965, 1981, 1971, 1981, 1999, 1969, 1994, 1999)
c <- c(1, 2, 3, 4, 5, 6, 7, 8, 9)
df <- data.frame(a, b, c)
df
#    a    b c
# 1  8 1950 1
# 2 18 1965 2
# 3 19 1981 3
# 4 19 1971 4
# 5 19 1981 5
# 6 20 1999 6
# 7 30 1969 7
# 8 32 1994 8
# 9 32 1999 9
df[duplicated(df$a) | duplicated(df$a, fromLast = T), ]
#    a    b c
# 3 19 1981 3
# 4 19 1971 4
# 5 19 1981 5
# 8 32 1994 8
# 9 32 1999 9
# NB: this is NOT a composite key. The second argument of duplicated() is
# 'incomparables', so df$b is silently ignored here; the result is identical
# to using duplicated(df$a) alone.
df[duplicated(df$a, df$b) | duplicated(df$a, df$b, fromLast = T), ]
#    a    b c
# 3 19 1981 3
# 4 19 1971 4
# 5 19 1981 5
# 8 32 1994 8
# 9 32 1999 9
df %>%
  group_by(a, b) %>%
  summarise(a = a, b = b, c = c, n = n()) %>%
  subset(n > 1) %>%
  select(a, b, c)
# A tibble: 2 x 3
# Groups: a, b [1]
#       a     b     c
#   <dbl> <dbl> <dbl>
# 1    19  1981     3
# 2    19  1981     5
df[duplicated(df, incomparables = c(c)), ]
# Error: argument 'incomparables != FALSE' is not used (yet)
# This error occurs even with no libraries loaded.
I may be missing something in the way duplicated() is used in brackets, but I couldn't figure it out.
Also, dplyr returns a tibble, dropping the index, which may be a drawback for you.
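Worth noting: base duplicated() also accepts a data frame, so a composite key does not require dplyr; a sketch with the same toy data:
key <- df[c("a", "b")]   # the composite key columns
df[duplicated(key) | duplicated(key, fromLast = TRUE), ]
#    a    b c
# 3 19 1981 3
# 5 19 1981 5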
I came up with a solution using nested sapply:
> village_dups = village[unique(unlist(which(sapply(sapply(village$Names,
+   function(x) which(village$Names == x)), function(y) length(y)) > 1))), ]
> village_dups
Names age height
1 John 18 76.1
2 John 19 77.0
3 John 20 78.1
5 Paul 22 78.8
6 Paul 23 79.7
7 Paul 24 79.9
8 Khan 25 81.1
9 Khan 26 81.2
10 Khan 27 81.8
