distinct drops columns after group_by - r

After doing a group_by I can't get distinct values unless I change the object back to a data frame.
library(dplyr)
x <- data.frame(A = c(1,1,2,2,3,3), B = c(1,2,3,4,5,6), C = c(6,6,6,5,5,5))
y <- x %>% group_by(A) %>% transmute(B = mean(B), C = mean(C))
y
distinct(y)
distinct(as.data.frame(y))
This behaviour seems to have changed after a recent dplyr release (I have dplyr_0.5.0) as I'm sure my code used to work. The question is, is this a bug or by design? If by design, I need to change a bunch of code. Thanks!

try
library(dplyr)
x <- data.frame(A = c(1,1,2,2,3,3), B = c(1,2,3,4,5,6), C = c(6,6,6,5,5,5))
y <- x %>% group_by(A) %>% transmute(B = mean(B), C = mean(C)) %>% ungroup()
y
distinct(y)
distinct(as.data.frame(y))
note the ungroup()

Related

Group dataframe row and column wise based on other dataframe?

I have a dataframe that I would like to group in both directions, first rowise and columnwise after. The first part worked well, but I am stuck with the second one. I would appreciate any help or advice for a solution that does both steps at the same time.
This is the dataframe:
df1 <- data.frame(
ID = c(rep(1,5),rep(2,5)),
ID2 = rep(c("A","B","C","D","E"),2),
A = rnorm(10,20,1),
B = rnorm(10,50,1),
C = rnorm(10,10,1),
D = rnorm(10,15,1),
E = rnorm(10,5,1)
)
This is the second dataframe, which holds the "recipe" for grouping:
df2 <- data.frame (
Group_1 = c("B","C"),
Group_2 = c("D","A"),
Group_3 = ("E"), stringsAsFactors = FALSE)
Rowise grouping:
df1_grouped<-bind_cols(df1[1:2], map_df(df2, ~rowSums(df1[unique(.x)])))
Now i would like to apply the same grouping to the ID2 column and sum the values in the other columns. My idea was to mutate a another column (e.g. "group", which contains the name of the final group of ID2. After this i can use group_by() and summarise() to calculate the sum for each. However, I can't figure out an automated way to do it
bind_cols(df1_grouped,
#add group label
data.frame(
group = rep(c("Group_2","Group_1","Group_1","Group_2","Group_3"),2))) %>%
#remove temporary label column and make ID a character column
mutate(ID2=group,
ID=as.character(ID))%>%
select(-group) %>%
#summarise
group_by(ID,ID2)%>%
summarise_if(is.numeric, sum, na.rm = TRUE)
This is the final table I need, but I had to manually assign the groups, which is impossible for big datasets
I will offer such a solution
library(tidyverse)
set.seed(1)
df1 <- data.frame(
ID = c(rep(1,5),rep(2,5)),
ID2 = rep(c("A","B","C","D","E"),2),
A = rnorm(10,20,1),
B = rnorm(10,50,1),
C = rnorm(10,10,1),
D = rnorm(10,15,1),
E = rnorm(10,5,1)
)
df2 <- data.frame (
Group_1 = c("B","C"),
Group_2 = c("D","A"),
Group_3 = ("E"), stringsAsFactors = FALSE)
df2 <- df2 %>% pivot_longer(everything())
df1 %>%
pivot_longer(-c(ID, ID2)) %>%
mutate(gr_r = df2$name[match(ID2, table = df2$value)],
gr_c = df2$name[match(name, table = df2$value)]) %>%
arrange(ID, gr_r, gr_c) %>%
pivot_wider(c(ID, gr_r), names_from = gr_c, values_from = value, values_fn = list(value = sum))

Calculation on every pair from grouped data.frame

My question is about performing a calculation between each pair of groups in a data.frame, I'd like it to be more vectorized.
I have a data.frame that has a consists of the following columns: Location , Sample , Var1, and Var2. I'd like to find the closet match for each Sample for each pair of Locations for both Var1 and Var2.
I can accomplish this for one pair of locations as such:
df0 <- data.frame(Location = rep(c("A", "B", "C"), each =30),
Sample = rep(c(1:30), times =3),
Var1 = sample(1:25, 90, replace =T),
Var2 = sample(1:25, 90, replace=T))
df00 <- data.frame(Location = rep(c("A", "B", "C"), each =30),
Sample = rep(c(31:60), times =3),
Var1 = sample(1:100, 90, replace =T),
Var2 = sample(1:100, 90, replace=T))
df000 <- rbind(df0, df00)
df <- sample_n(df000, 100) # data
dfl <- df %>% gather(VAR, value, 3:4)
df1 <- dfl %>% filter(Location == "A")
df2 <- dfl %>% filter(Location == "B")
df3 <- merge(df1, df2, by = c("VAR"), all.x = TRUE, allow.cartesian=TRUE)
df3 <- df3 %>% mutate(DIFF = abs(value.x-value.y))
result <- df3 %>% group_by(VAR, Sample.x) %>% top_n(-1, DIFF)
I tried other possibilities such as using dplyr::spread but could not avoid the "Error: Duplicate identifiers for rows" or columns half filled with NA.
Is there a more clean and automated way to do this for each possible group pair? I'd like to avoid the manual subset and merge routine for each pair.
One option would be to create the pairwise combination of 'Location' with combn and then do the other steps as in the OP's code
library(tidyverse)
df %>%
# get the unique elements of Location
distinct(Location) %>%
# pull the column as a vector
pull %>%
# it is factor, so convert it to character
as.character %>%
# get the pairwise combinations in a list
combn(m = 2, simplify = FALSE) %>%
# loop through the list with map and do the full_join
# with the long format data df1
map(~ full_join(df1 %>%
filter(Location == first(.x)),
df1 %>%
filter(Location == last(.x)), by = "VAR") %>%
# create a column of absolute difference
mutate(DIFF = abs(value.x - value.y)) %>%
# grouped by VAR, Sample.x
group_by(VAR, Sample.x) %>%
# apply the top_n with wt as DIFF
top_n(-1, DIFF))
Also, as the OP mentioned about automatically picking up instead of doing double filter (not clear about the expected output though)
df %>%
distinct(Location) %>%
pull %>%
as.character %>%
combn(m = 2, simplify = FALSE) %>%
map(~ df1 %>%
# change here i.e. filter both the Locations
filter(Location %in% .x) %>%
# spread it to wide format
spread(Location, value, fill = 0) %>%
# create the DIFF column by taking the differene
mutate(DIFF = abs(!! rlang::sym(first(.x)) -
!! rlang::sym(last(.x)))) %>%
group_by(VAR, Sample) %>%
top_n(-1, DIFF))

Remove points that are repeated consecutively more than n times

I have a large data set with X and Y points. I want to animate it, so I want to remove the points that are fixed locations.I would like to remove rows where the same X and Y is repeated more than n times.
So far I did this, is there a more elegant solution? Thanks!
uniques <- unique(data[c("Lat","Long")])
uniques$values = row.names(uniques)
uniques2 <- inner_join(data,uniques,by=c("Lat","Long"))
reps <- data.frame(unclass(rle(uniques2$values)))
delete <- as.character(reps$values[(reps$lengths)>10])
data2 <- uniques2[! uniques2$values %in% delete),]
Tidyverse-way would be
data2 <- data %>%
group_by(Long, Lat) %>%
filter(n() <= 10) %>%
ungroup()
Assuming you want to keep the first of the rows where x and y start repeating, you could try the following approach with the dplyr package:
library(dplyr)
# Example data
df <- data.frame(
x = c(rep(1, 5), 2:6, rep(7, 5)),
y = c(rep(9, 5), 2:6, rep(8, 5))
)
# Cut-off value
n <- 3
# Remove unwanted rows
new_df <- df %>%
mutate(same_as_prev = x == lag(x) & y == lag(y)) %>%
group_by(x, y, same_as_prev) %>%
mutate(consec_count = n()) %>%
filter(consec_count <= n & same_as_prev) %>%
ungroup()
Using data.table I will try this one line solution:
library(data.table)
data < as.data.table(data)[, count:=.N, by=.(Lat,Long)][count<n][,count:=NULL]
Best!

Superscripting a variable over another when building tables in R and knitr

I am trying to build a table, and one of my variables should have another variable superscriptet after it. I can find several related answers here on SO, but they all involve fixed values that need to be superscriptet, instead of vectors as in my case.
Also most examples involve plot legends and not tables like in my case (Although I don't think that makes much of a difference).
Example data:
library(tidyverse)
library(knitr)
df <- crossing(
X = seq(1:2),
Y = c("A", "B"))
df
# A tibble: 4 x 2
X Y
<int> <chr>
1 1 A
2 1 B
3 2 A
4 2 B
I would like to mutate a new variable that is just X with Y values superscriptet after it.
Here is what I have tried (Doesn't work):
df %>% mutate(
New = paste0(X, "^Y")) %>%
kable()
df %>% mutate(
New = paste0(X, ^{Y})) %>%
kable()
df %>% mutate(
New = paste0(X, bquote(^~{.Y}~))) %>%
kable()
Any help appreciated.
You could use tableHTML:
df <- data.frame(
X = seq(1:2),
Y = c("A", "B"))
library(dplyr)
library(tableHTML)
You can slightly modify X with the HTML tag <sup> to display Y as a superset:
df %>%
mutate(X = paste0(X, "<sup>", Y, "</sup>")) %>%
select(X) %>%
tableHTML(rownames = FALSE,
escape = FALSE,
widths = 50)
Edit
As pointed out by Steen, this also works with knitr:
df %>%
mutate(X = paste0(X, "<sup>", Y, "</sup>")) %>%
select(X) %>%
knitr::kable(escape = FALSE)
Is it for a pdf output?
Because in this case the following could work:
library(tidyverse)
library(knitr)
df <- crossing(
X = seq(1:2),
Y = c("A", "B"))
df %>% mutate(
New = paste0(X, "\\textsuperscript{", Y, "}")) %>%
kable(escape = FALSE)
Using escape = FALSE to add LaTeX inside the table.

working with paired data across groups in the tidyverse

I have multiple observations from each of a few groups and I'd like to make a matrix of QQ plots (or another type of plot), comparing each group to every other group.
Here's an example of what I'm talking about:
library(tidyverse)
set.seed(27599)
n <- 30
d <- data_frame(person = c(rep('Alice', n),
rep('Bob', n),
rep('Charlie', n),
rep('Danielle', n)),
score = c(rnorm(n = n),
rnorm(n = n, mean = 0.1),
rnorm(n = n, sd = 2),
rnorm(n = n, mean = 0.3, sd = 1.4)))
by_hand <- data_frame(a = sort(d$score[d$person == 'Alice']),
b = sort(d$score[d$person == 'Bob']),
c = sort(d$score[d$person == 'Charlie']),
d = sort(d$score[d$person == 'Danielle']))
pairs(x = by_hand,
lower.panel = function(x, y) { points(x, y); abline(0, 1);})
Here, I've manipulated the data by hand and used graphics::pairs() to make the plot. Can the same be done inside the tidyverse?
Here's what I've tried.
d %>%
group_by(person) %>%
mutate(score = sort(score)) %>%
glimpse()
This seems promising.
d %>%
group_by(person) %>%
mutate(score = sort(score)) %>%
spread(key = person, value = score)
This gives the 'duplicate identifiers' error.
Maybe reshape2 would be better to use here?
d %>%
group_by(person) %>%
mutate(score = sort(score)) %>%
dcast(formula = score ~ person)
This creates a data.frame with 120 rows, and most of the values (90 per person) are NA. How can I create a wide data.frame without introducing so many NA?
You need a variable that links the row position for each person. Try
by_tidyverse <- d %>%
group_by(person) %>%
mutate(rowID=1:n(),
score=sort(score)
) %>%
spread(key = person, value = score) %>%
select(-rowID)
pairs(x = by_tidyverse, lower.panel = function(x, y) { points(x, y); abline(0, 1);})

Resources