Reorder Rows Multiple Values - r

I am trying to arrange my current data set so that all the Visits are arranged for all the Individuals.
I tried the method suggested in this question and it works but only shows the values for the first individual.
Data:
structure(list(Individual = c("John", "John", "John", "Anna",
"Anna", "Anna", "Seth", "Seth", "Seth"), Visit = c("Last", "First",
"Review", "Last", "First", "Review", "Last", "First", "Review"
), Amount = c(25, 100, 75, 25, 100, 75, 25, 100, 75)), row.names = c(NA,
-9L), class = c("tbl_df", "tbl", "data.frame"))
Attempted code:
target <- c("First","Review","Last")
Visit <- Visit[match(target, Visit$Visit),]

You can use :
Visit[with(Visit, order(Individual, match(Visit, target))), ]
Or using dplyr :
library(dplyr)
df %>% arrange(Individual, match(Visit, target))
# Individual Visit Amount
# <chr> <chr> <dbl>
#1 Anna First 100
#2 Anna Review 75
#3 Anna Last 25
#4 John First 100
#5 John Review 75
#6 John Last 25
#7 Seth First 100
#8 Seth Review 75
#9 Seth Last 25

I think you need to convert the Visit field into an ordered factor.
# Encode the desired visit order as the levels of an ordered factor, so that
# sorting on Visit follows First < Review < Last instead of alphabetical order.
target <- c("First", "Review", "Last")
# Spell out TRUE: `T` is an ordinary variable that can be reassigned.
df$Visit <- factor(df$Visit, levels = target, ordered = TRUE)
dplyr::arrange(df, Individual, Visit)
> dplyr::arrange(df, Individual, Visit)
# A tibble: 9 x 3
Individual Visit Amount
<chr> <ord> <dbl>
1 Anna First 100
2 Anna Review 75
3 Anna Last 25
4 John First 100
5 John Review 75
6 John Last 25
7 Seth First 100
8 Seth Review 75
9 Seth Last 25
dput used
df <- structure(list(Individual = c("John", "John", "John", "Anna",
"Anna", "Anna", "Seth", "Seth", "Seth"), Visit = c("Last", "First",
"Review", "Last", "First", "Review", "Last", "First", "Review"
), Amount = c(25, 100, 75, 25, 100, 75, 25, 100, 75)), row.names = c(NA,
-9L), class = c("tbl_df", "tbl", "data.frame"))

Related

Transform data to long with grouped columns

For this week's TidyTuesday challenge, I am unable to group the column names in R the way I previously did with the pivot_longer function from tidyr. Here is my code; I do not understand why it throws an error and does not give what I want.
library(tidyverse)
tuesdata <- tidytuesdayR::tt_load(2023, week = 7)
age_gaps <- tuesdata$age_gaps
df_long <- age_gaps %>%
pivot_longer(cols= actor_1_name:actor_2_name, names_to = "actornumber", values_to = "actorname") %>%
pivot_longer(cols= character_1_gender:character_2_gender, names_to = "gendernumber", values_to = "gender") %>%
pivot_longer(cols= actor_1_age:actor_2_age, names_to = "agenumber", values_to = "age") %>%
select(movie_name, release_year, director, age_difference, actorname, gender, age)
As seen from the code, the initial data has 1155 rows and after doing the quick data wrangling, I am expecting to get a data of 1155x2=2310 rows as I would like to merge the columns on actor names and their relevant information such as age and birthdate. Yet, the code does not give me the expected outcome and I am wondering why and how can I solve this problem. Thank you for your attention beforehand.
Example data (first 6 rows)
age_gaps <- structure(list(movie_name = c("Harold and Maude", "Venus", "The Quiet American",
"The Big Lebowski", "Beginners", "Poison Ivy"), release_year = c(1971,
2006, 2002, 1998, 2010, 1992), director = c("Hal Ashby", "Roger Michell",
"Phillip Noyce", "Joel Coen", "Mike Mills", "Katt Shea"), age_difference = c(52,
50, 49, 45, 43, 42), couple_number = c(1, 1, 1, 1, 1, 1), actor_1_name = c("Ruth Gordon",
"Peter O'Toole", "Michael Caine", "David Huddleston", "Christopher Plummer",
"Tom Skerritt"), actor_2_name = c("Bud Cort", "Jodie Whittaker",
"Do Thi Hai Yen", "Tara Reid", "Goran Visnjic", "Drew Barrymore"
), character_1_gender = c("woman", "man", "man", "man", "man",
"man"), character_2_gender = c("man", "woman", "woman", "woman",
"man", "woman"), actor_1_birthdate = structure(c(-26725, -13666,
-13442, -14351, -14629, -13278), class = "Date"), actor_2_birthdate = structure(c(-7948,
4536, 4656, 2137, 982, 1878), class = "Date"), actor_1_age = c(75,
74, 69, 68, 81, 59), actor_2_age = c(23, 24, 20, 23, 38, 17)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
You could set ".value" in names_to and supply one of names_sep or names_pattern to specify how the column names should be split.
library(tidyr)
# Strip the "actor_"/"character_" prefix, then split what is left (e.g.
# "1_name") at the underscore: the first piece fills the `actor` column and
# the second piece (".value") names the output column receiving the values.
pivot_longer(
  age_gaps,
  cols = actor_1_name:actor_2_age,
  names_to = c("actor", ".value"),
  names_prefix = "(actor|character)_",
  names_sep = "_"
)
# A tibble: 12 × 10
movie_name release_year director age_difference couple_number actor name gender birthdate age
<chr> <dbl> <chr> <dbl> <dbl> <chr> <chr> <chr> <date> <dbl>
1 Harold and Maude 1971 Hal Ashby 52 1 1 Ruth Gordon woman 1896-10-30 75
2 Harold and Maude 1971 Hal Ashby 52 1 2 Bud Cort man 1948-03-29 23
3 Venus 2006 Roger Michell 50 1 1 Peter O'Toole man 1932-08-02 74
4 Venus 2006 Roger Michell 50 1 2 Jodie Whittaker woman 1982-06-03 24
5 The Quiet American 2002 Phillip Noyce 49 1 1 Michael Caine man 1933-03-14 69
6 The Quiet American 2002 Phillip Noyce 49 1 2 Do Thi Hai Yen woman 1982-10-01 20
7 The Big Lebowski 1998 Joel Coen 45 1 1 David Huddleston man 1930-09-17 68
8 The Big Lebowski 1998 Joel Coen 45 1 2 Tara Reid woman 1975-11-08 23
9 Beginners 2010 Mike Mills 43 1 1 Christopher Plummer man 1929-12-13 81
10 Beginners 2010 Mike Mills 43 1 2 Goran Visnjic man 1972-09-09 38
11 Poison Ivy 1992 Katt Shea 42 1 1 Tom Skerritt man 1933-08-25 59
12 Poison Ivy 1992 Katt Shea 42 1 2 Drew Barrymore woman 1975-02-22 17

Quicker way? Remove rows in book1, take row 4 values as column name, set some column name same as book2

Below is the first dataframe where I want to remove the first 3 rows:
book1 <- structure(list(Instructions..xyz = c("Note: abc", "", "Set1",
"id", "632592651", "633322173", "634703802", "634927873", "635812953",
"636004739", "636101211", "636157799", "636263106", "636752420"
), X = c("", "", "", "title", "asdf", "cat", "dog", "mouse",
"elephant", "goose", "rat", "mice", "kitty", "kitten"), X.1 = c("",
"", "", "hazard", "y", "y", "y", "n", "n", "y", "y", "n", "n",
"y"), X.2 = c("", "", "Set2", "id", "632592651", "633322173",
"634703802", "634927873", "635812953", "636004739", "636101211",
"636157799", "636263106", "636752420"), X.3 = c("", "", "", "title",
"asdf2", "cat2", "dog2", "mouse2", "elephant2", "goose2", "rat2",
"mice2", "kitty2", "kitten2"), X.4 = c("", "", "", "index", "0.664883807",
"0.20089779", "0.752228086", "0.124729276", "0.626285086", "0.134537909",
"0.612526768", "0.769622463", "0.682532524", "0.819015658")), class = "data.frame", row.names = c(NA,
-14L))
I did book1 <- book1[-c(1:3),] but I'm not sure how to make id, title, hazard, id, title, index as the column name instead of Instructions..xyz, etc. See image below for desired output
Then for the second dataframe,
book2 <- structure(list(identity = c(632592651L, 633322173L, 634703802L,
634927873L, 635812953L, 636004739L, 636101211L, 636157799L, 636263106L,
636752420L, 636809222L, 2004722036L, 2004894388L, 2005045755L,
2005535472L, 2005630542L, 2005788781L, 2005809679L, 2005838317L,
2005866692L), text = c("asdf_xyz", "cat", "dog", "mouse", "elephant",
"goose", "rat", "mice", "kitty", "kitten", "tiger_xyz", "lion",
"leopard", "ostrich", "kangaroo", "platypus", "fish", "reptile",
"mammals", "amphibians_xyz"), volume = c(1234L, 432L, 324L, 333L,
2223L, 412346L, 7456L, 3456L, 2345L, 2345L, 6L, 345L, 23L, 2L,
4778L, 234L, 8675L, 3459L, 8L, 9L)), class = "data.frame", row.names = c(NA,
-20L))
I then rename column 1 and 2 in book2 so that it matches that of book1 by names(book2)[1:2] <- c('id','title') where I can later do inner_join. The desired output is shown in the image below by
library(dplyr)
book1 %>%
inner_join(book2, by = c("id", "title"))
This is taking quite a few steps and wondering if there's a simplified version to this?
Something like this?
# Rows 1-3 are preamble and row 4 carries the real headers, so keep rows 5+
# and label each half of the sheet (columns 1-3 and 4-6) with its own slice
# of row 4.
book2a <- setNames(
  book1[-(1:4), 1:3],
  unlist(book1[4, 1:3], use.names = FALSE)
)
book2b <- setNames(
  book1[-(1:4), 4:6],
  unlist(book1[4, 4:6], use.names = FALSE)
)
# Titles in the second half carry a "2" and the index is stored as text.
book2b[["title"]] <- sub("2", "", book2b[["title"]])
book2b[["index"]] <- as.numeric(book2b[["index"]])
# Full join on the columns the halves share, drop the temporaries, and print.
book2 <- merge(book2a, book2b, all = TRUE)
rm(book2a, book2b)
book2
#> id title hazard index
#> 1 632592651 asdf y 0.6648838
#> 2 633322173 cat y 0.2008978
#> 3 634703802 dog y 0.7522281
#> 4 634927873 mouse n 0.1247293
#> 5 635812953 elephant n 0.6262851
#> 6 636004739 goose y 0.1345379
#> 7 636101211 rat y 0.6125268
#> 8 636157799 mice n 0.7696225
#> 9 636263106 kitty n 0.6825325
#> 10 636752420 kitten y 0.8190157
Created on 2022-06-25 by the reprex package (v2.0.1)
Found the solution to the first question
library(janitor)
book1 <- row_to_names(dat=book1, row_number=4, remove_row = TRUE, remove_rows_above = TRUE)
I applied
names(book1)[4:5] <- c('id1','title1')
to obtain unique column names, then tried inner_join as proposed earlier but got an error; I found that book1$id is character whereas book2$id is integer, so I did
book1$id <- as.integer(book1$id)
and finally it works with
library(tidyverse)
Yeah <- book1 %>%
inner_join(book2, by = c("id", "title"))
Output below:
id title hazard id1 title1 index volume
1 633322173 cat y 633322173 cat2 0.20089779 432
2 634703802 dog y 634703802 dog2 0.752228086 324
3 634927873 mouse n 634927873 mouse2 0.124729276 333
4 635812953 elephant n 635812953 elephant2 0.626285086 2223
5 636004739 goose y 636004739 goose2 0.134537909 412346
6 636101211 rat y 636101211 rat2 0.612526768 7456
7 636157799 mice n 636157799 mice2 0.769622463 3456
8 636263106 kitty n 636263106 kitty2 0.682532524 2345
9 636752420 kitten y 636752420 kitten2 0.819015658 2345
Still wondering if there's a quicker way?

Perform a series of mutations to columns in dataframe

I am trying to replace some text in my dataframe (a few rows given below)
> dput(Henry.longer[1:4,])
structure(list(N_l = c(4, 4, 4, 4), UG = c("100", "100", "100",
"100"), S = c(12, 12, 12, 12), Sample = c(NA, NA, NA, NA), EQ = c("Henry",
"Henry", "Henry", "Henry"), DF = c(0.798545454545455, 0.798545454545455,
0.798545454545455, 0.798545454545455), meow = c("Henry.Exterior.single",
"Multi", "Henry.Exterior.multi", "Henry.Interior.single"), Girder = c("Henry.Exterior.single",
"Henry.Interior.multi", "Henry.Exterior.multi", "Interior")), row.names = c(NA,
-4L), groups = structure(list(UG = "100", S = 12, .rows = list(
1:4)), row.names = c(NA, -1L), class = c("tbl_df", "tbl",
"data.frame"), .drop = FALSE), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"))
I try to mutate the dataframe as:
Henry.longer <- Henry.longer %>%
mutate(Loading = str_replace(meow, "Henry.Exterior.single", "Single")) %>%
mutate(Loading = str_replace(meow, "Henry.Exterior.multi", "Multi")) %>%
mutate(Loading = str_replace(meow, "Henry.Interior.single", "Single")) %>%
mutate(Loading = str_replace(meow, "Henry.Interior.multi", "Multi")) %>%
mutate(Girder = str_replace(meow, "Henry.Exterior.multi", "Exterior")) %>%
mutate(Girder = str_replace(meow, "Henry.Exterior.single", "Exterior")) %>%
mutate(Girder = str_replace(meow, "Henry.Interior.multi", "Interior")) %>%
mutate(Girder = str_replace(meow, "Henry.Interior.single", "Interior")) %>%
select(-meow)
But for some reason the replacements are not applied to all the rows, and I only get:
N_l UG S Sample EQ DF Loading Girder
1 4 100 12 NA Henry 0.799 Henry.Exterior.single Henry.Exterior.single
2 4 100 12 NA Henry 0.799 Multi Henry.Interior.multi
3 4 100 12 NA Henry 0.799 Henry.Exterior.multi Henry.Exterior.multi
4 4 100 12 NA Henry 0.799 Henry.Interior.single Interior
I think we can use lookup vectors for this, if it's easy or safer to use static string lookups:
tr_vec <- c(Henry.Exterior.single = "Single", Henry.Exterior.multi = "Multi", Henry.Interior.single = "Single", Henry.Interior.multi = "Multi")
tr_vec2 <- c(Henry.Exterior.multi = "Exterior", Henry.Exterior.single = "Exterior", Henry.Interior.multi = "Interior", Henry.Interior.single = "Interior")
Henry.longer %>%
mutate(
Loading = coalesce(tr_vec[Loading], Loading),
Girder = coalesce(tr_vec2[Girder], Girder)
)
# # A tibble: 4 x 8
# # Groups: UG, S [1]
# N_l UG S Sample EQ DF Loading Girder
# <dbl> <chr> <dbl> <lgl> <chr> <dbl> <chr> <chr>
# 1 4 100 12 NA Henry 0.799 Single Exterior
# 2 4 100 12 NA Henry 0.799 Multi Interior
# 3 4 100 12 NA Henry 0.799 Multi Exterior
# 4 4 100 12 NA Henry 0.799 Single Interior
The advantage of RonakShah's regex solution is that it can very easily handle many of the types of substrings you appear to need. Regexes do carry a little risk, though, in that they may (unlikely in that answer, but still) mismatch.
Instead of using str_replace I guess it would be easier to extract what you want using regex.
library(dplyr)
Henry.longer %>%
mutate(Loading = sub('.*\\.', '', meow),
Girder = sub('.*\\.(\\w+)\\..*', '\\1', meow))
where
Loading - removes everything until last dot
Girder - extracts a word between two dots.
Oh boy, looks like you've got some answers here already but here's a super-simple one that uses stringr::str_extract:
# Pull the loading case and the girder position out of `meow` in one step;
# the two new columns are computed independently of each other.
Henry.longer <- mutate(
  Henry.longer,
  Loading = str_extract(meow, "single|multi"),
  Girder = str_extract(meow, "Interior|Exterior")
)
It's worth noting that the demo data has a weird entry for meow in one column, so it didn't run perfectly on my machine:

The fastest way to remove rows from a very big dataframe matching another dataframe

What is the fastest function in R to remove rows from a data frame when their first two columns match a row in another data frame? For example, if data frame A is as below (with more information columns):
NAME SURENAME
John Beer
Rose Pitt
Bob Kin
Charile Kind
Smith Red
Brad Tea
Kale Joe
Ana Bread
Lauren Old
Mike Karl
and B as below:
NAME SURENAME
Rose Pitt
Smith Red
Mike Karl
I want B to be removed from A to be like:
NAME SURENAME
John Beer
Bob Kin
Charile Kind
Brad Tea
Kale Joe
Ana Bread
Lauren Old
So in my case, A has 2 million rows (and 10 other columns) and B has 200,000 rows (all unique Name and Surnames).
Tested a benchmark filtering a data frame of 2 million rows by one with 200,000 rows, as indicated in the original post, where you can clearly see the speed of data.table relative to dplyr. Given the immense time dplyr functions took to run, particularly set_diff, I only ran each once.
# Compare dplyr::anti_join(), dplyr::setdiff() and fsetdiff() (after loading
# data.table) on removing from `df` the rows that are present in `indices`.
# Each expression is evaluated once (replications = 1).
rbenchmark::benchmark(
# NOTE: every timed expression re-seeds the RNG and rebuilds its own input
# data, so the elapsed times include data generation as well as the removal.
"dplyr_anti_join" = {
set.seed(1)
# NOTE(review): `a` is built from 10,000,000 draws but `b` from 100,000,000,
# while the accompanying text mentions 2 million rows -- confirm the
# intended size. The same sizes are used in all three expressions.
df <- data.frame(a = letters[runif(10000000, min = 1, max = 26)],
b = runif(100000000, 1, 200000))
indices <- data.frame(a = letters[runif(200000, min = 1, max = 26)],
b = 1:200000)
dplyr::anti_join(df, indices, by = c("a", "b"))
},
"dplyr_set_diff" = {
set.seed(1)
df <- data.frame(a = letters[runif(10000000, min = 1, max = 26)],
b = runif(100000000, 1, 200000))
indices <- data.frame(a = letters[runif(200000, min = 1, max = 26)],
b = 1:200000)
dplyr::setdiff(df, indices)
},
"dt" = {
set.seed(1)
# Loading the package happens inside the timed expression as well.
library(data.table)
df <- data.table(a = letters[runif(10000000, min = 1, max = 26)],
b = runif(100000000, 1, 200000))
indices <- data.table(a = letters[runif(200000, min = 1, max = 26)],
b = 1:200000)
fsetdiff(df, indices)
},
replications = 1
)
#> test replications elapsed relative user.self sys.self user.child sys.child
#> 1 dplyr_anti_join 1 637.06 13.165 596.86 11.50 NA NA
#> 2 dplyr_set_diff 1 9981.93 206.281 320.67 4.66 NA NA
#> 3 dt 1 48.39 1.000 80.61 8.73 NA NA
Maybe you can try the code below using setdiff() from dplyr package, but you need to check its speed for large data frame (I am not sure about its performance then)
C <- dplyr::setdiff(A,B)
such that
> C
NAME SURENAME
1 John Beer
2 Bob Kin
3 Charile Kind
4 Brad Tea
5 Kale Joe
6 Ana Bread
7 Lauren Old
DATA
A <- structure(list(NAME = c("John", "Rose", "Bob", "Charile", "Smith",
"Brad", "Kale", "Ana", "Lauren", "Mike"), SURENAME = c("Beer",
"Pitt", "Kin", "Kind", "Red", "Tea", "Joe", "Bread", "Old", "Karl"
)), class = "data.frame", row.names = c(NA, -10L))
B <- structure(list(NAME = c("Rose", "Smith", "Mike"), SURENAME = c("Pitt",
"Red", "Karl")), class = "data.frame", row.names = c(NA, -3L))

How to append 2 data sets one below the other having slightly different column names?

Data set1:
ID Name Territory Sales
1 Richard NY 59
8 Sam California 44
Data set2:
Terr ID Name Comments
LA 5 Rick yes
MH 11 Oly no
I want the final data set to have only the columns of the 1st data set, to treat Terr as the same column as Territory, and not to carry over the Comments column.
Final data should look like:
ID Name Territory Sales
1 Richard NY 59
8 Sam California 44
5 Rick LA NA
11 Oly MH NA
Thanks in advance
A possible solution:
# create a named vector with names from 'set2'
# with the positions of the matching columns in 'set1'
nms2 <- sort(unlist(sapply(names(set2), agrep, x = names(set1))))
# only keep the columns in 'set2' for which a match is found
# and give them the same names as in 'set1'
set2 <- setNames(set2[names(nms2)], names(set1[nms2]))
# bind the two dataset together
# option 1:
library(dplyr)
bind_rows(set1, set2)
# option 2:
library(data.table)
rbindlist(list(set1, set2), fill = TRUE)
which gives (dplyr-output shown):
ID Name Territory Sales
1 1 Richard NY 59
2 8 Sam California 44
3 5 Rick LA NA
4 11 Oly MH NA
Used data:
set1 <- structure(list(ID = c(1L, 8L),
Name = c("Richard", "Sam"),
Territory = c("NY", "California"),
Sales = c(59L, 44L)),
.Names = c("ID", "Name", "Territory", "Sales"), class = "data.frame", row.names = c(NA, -2L))
set2 <- structure(list(Terr = c("LA", "MH"),
ID = c(5L, 11L),
Name = c("Rick", "Oly"),
Comments = c("yes", "no")),
.Names = c("Terr", "ID", "Name", "Comments"), class = "data.frame", row.names = c(NA, -2L))

Resources