Replace a string from lookuptable in R - r

I have a txt file with a list:
name
Test_123
run_456
Test_789
I have another lookuptable that contains the "ID" and gives me a "Plate"
ID plate
123 xxx
456 zzz
789 bbb
Would love to get here
Test_xxx
run_zzz
Test_bbb
My current code does not work entirely.
I either get <NA> (I guess because it looks for values rather than for a string) or I get errors.
Thanks so much for your help!
B

A tidyverse way to do this would be:
# Attach the tidyverse, which supplies %>%, separate(), left_join() and mutate().
library(tidyverse)
df1 %>%
# Split "name" into a prefix (kept as "name") and an "ID" part; convert=TRUE
# lets separate() turn the ID into an integer so it can match the integer df2$ID.
separate(name, c("name", "ID"), convert=TRUE) %>%
# Look up the plate belonging to each ID.
left_join(df2, by="ID") %>%
# Rebuild the label as <prefix>_<plate>.
mutate(new_name = paste(name, plate, sep="_"))
Using:
df1 <- structure(list(name = c("Test_123", "run_456", "Test_789")),
.Names = "name", class = "data.frame", row.names = c(NA, -3L))
df2 <- structure(list(ID = c(123L, 456L, 789L), plate = c("xxx", "zzz",
"bbb")), .Names = c("ID", "plate"), class = "data.frame", row.names = c(NA,
-3L))
Note that:
separate(..., convert=TRUE) uses some heuristics to convert character columns into integers. You can otherwise do this manually: mutate(ID=as.integer(ID))
You could use unite() (which does the opposite of separate()) instead of mutate(new_name = paste(name, plate, sep="_")), which would also remove the previous columns

An option would be gsubfn
library(gsubfn)
gsubfn("(\\d+)", setNames(as.list(df2$plate), df2$ID), df1$name)
#[1] "Test_xxx" "run_zzz" "Test_bbb"
data
df1 <- structure(list(name = c("Test_123", "run_456", "Test_789")),
.Names = "name", class = "data.frame", row.names = c(NA, -3L))
df2 <- structure(list(ID = c(123L, 456L, 789L), plate = c("xxx", "zzz",
"bbb")), .Names = c("ID", "plate"), class = "data.frame", row.names = c(NA,
-3L))

For a base R option, you could add a new column to your first data frame with the exact join data:
# Join key: drop everything up to and including the last "_" that is followed
# by digits, leaving only the trailing numeric ID (kept as character).
df1$ID <- sub(".*_(?=[0-9]+)", "", df1$name, perl=TRUE)
# Prefix: drop only the trailing "_<digits>" suffix. The pattern is anchored
# with "$" so that a name such as "run_2_456" yields "run_2" (matching the ID
# "456" extracted above) instead of losing its first numeric segment.
df1$start <- sub("_[0-9]+$", "", df1$name)
Then, use merge:
result <- merge(df1, df2, by="ID")
And finally create your desired output column:
result$out <- paste0(result$start, "_", result$plate)
result$out
[1] "Test_xxx" "run_zzz" "Test_bbb"
Data:
df1 <- data.frame(name=c("Test_123", "run_456", "Test_789"), stringsAsFactors=FALSE)
df2 <- data.frame(ID=c("123", "456", "789"),
plate=c("xxx", "zzz", "bbb"), stringsAsFactors=FALSE)
Demo

Related

use dplyr to get list items from dataframe in R

I have a dataframe being returned from Microsoft365R:
SKA_student <- structure(list(name = "Computing SKA 2021-22.xlsx", size = 22266L,
lastModifiedBy =
structure(list(user =
structure(list(email = "my#email.com",
id = "8ae50289-d7af-4779-91dc-e4638421f422",
displayName = "Name, My"), class = "data.frame", row.names = c(NA, -1L))),
class = "data.frame", row.names = c(NA, -1L)),
fileSystemInfo = structure(list(
createdDateTime = "2021-09-08T16:03:38Z",
lastModifiedDateTime = "2021-09-16T00:09:04Z"), class = "data.frame", row.names = c(NA,-1L))), row.names = c(NA, -1L), class = "data.frame")
I can return all the lastModifiedBy data through:
SKA_student %>% select(lastModifiedBy)
lastModifiedBy.user.email lastModifiedBy.user.id lastModifiedBy.user.displayName
1 my#email.com 8ae50289-d7af-4779-91dc-e4638421f422 Name, My
But if I want a specific item in the lastModifiedBy list, it doesn't work, e.g.:
SKA_student %>% select(lastModifiedBy.user.email)
Error: Can't subset columns that don't exist.
x Column `lastModifiedBy.user.email` doesn't exist.
I can get this working through base, but would really like a dplyr answer
This function allows you to flatten all the list columns (I found this ages ago on SO but can't find the original post for credit)
# Flatten the list columns (including nested data.frame columns) of `data`
# into ordinary columns and bind them next to the non-list columns.
#
# data: a data.frame, possibly containing list / data.frame columns.
# Returns a data.frame holding the non-list columns of `data` followed by the
# flattened list columns; `data` is returned unchanged when it has no list
# columns.
SO_flat_cols <- function(data) {
  # vapply() always yields a logical vector, whereas sapply() may return a
  # list for zero-column input.
  is_list_col <- vapply(data, is.list, logical(1))
  # Without this guard, apply() below returns a list for a zero-column
  # selection and t() fails.
  if (!any(is_list_col)) {
    return(data)
  }
  # apply() coerces the selected columns to a matrix and unlist() flattens
  # each row; t() restores the one-row-per-record orientation.
  flattened <- t(apply(data[is_list_col], 1, unlist))
  cbind(data[!is_list_col], flattened)
}
Then you can select as you like.
SO_flat_cols (SKA_student) %>%
select(lastModifiedBy.user.email)
Alternatively you can get to the end by recursively pulling the lists
SKA_student %>%
pull(lastModifiedBy) %>%
pull(user) %>%
select(email)
You could use
library(dplyr)
library(tidyr)
SKA_student %>%
unnest_wider(lastModifiedBy) %>%
select(email)
This returns
# A tibble: 1 x 1
email
<chr>
1 my#email.com

How to check if values in one dataframe exist in another dataframe in R?

Suppose we have a data frame like this:
id reply user_name
1 NA John
2 NA Amazon
3 NA Bob
And another data frame like this:
name organisation
John Amazon
Pat Apple
Is there a way to fill in the reply column in the first data frame with 'True' or 'False' if the values in column 3 match either columns 1 or 2 in the second data frame? So for example, since John and Amazon from the second data frame exist in the first data frame, I want the first data frame to update as so:
id reply user_name
1 True John
2 True Amazon
3 False Bob
Try this using %in% and a vector for all values:
#Code
df1$reply <- df1$user_name %in% c(df2$name,df2$organisation)
Output:
df1
id reply user_name
1 1 TRUE John
2 2 TRUE Amazon
3 3 FALSE Bob
Some data used:
#Data1
df1 <- structure(list(id = 1:3, reply = c(NA, NA, NA), user_name = c("John",
"Amazon", "Bob")), class = "data.frame", row.names = c(NA, -3L
))
#Data2
df2 <- structure(list(name = c("John", "Pat"), organisation = c("Amazon",
"Apple")), class = "data.frame", row.names = c(NA, -2L))
We can use %in% in base R
df1$reply <- df1$user_name %in% unlist(df2)
If we want to change the format of the logical to character string
df1$reply <- sub("^(.)(.*)", "\\1\\L\\2", df1$reply, perl = TRUE)
df1$reply
#[1] "True" "True" "False"
data
df1 <- structure(list(id = 1:3, reply = c(NA, NA, NA), user_name = c("John",
"Amazon", "Bob")), class = "data.frame", row.names = c(NA, -3L
))
df2 <- structure(list(name = c("John", "Pat"), organisation = c("Amazon",
"Apple")), class = "data.frame", row.names = c(NA, -2L))
Here's how you can get the exact output you're looking for with 3 lines of code!
# Example data; FALSE is spelled out because F is an ordinary variable that can be reassigned.
df1 <- data.frame(id = 1:3, reply = NA, user.name = c("John", "Amazon", "Bob"), stringsAsFactors = FALSE)
df2 <- data.frame(id = 1:2, name = c("John", "Pat"), organisation = c("Amazon", "Apple"), stringsAsFactors = FALSE)
# Match only against the name and organisation columns (not the id column), then turn TRUE/FALSE into "True"/"False"; the parentheses make the %in% / %>% evaluation order explicit.
df1$reply <- (df1$user.name %in% unlist(df2[c("name", "organisation")])) %>% as.character() %>% str_to_title()
Output
id reply user.name
1 True John
2 True Amazon
3 False Bob
You will need the packages dplyr, magrittr, and stringr, which I highly recommend for data wrangling of all kinds.
Building off the first answer, you can also solve this in a tidy way too.
# Build the example data frames (FALSE spelled out: F can be reassigned)
df1 <- data.frame(id = 1:3, reply = NA, user.name = c("John", "Amazon", "Bob"), stringsAsFactors = FALSE)
df2 <- data.frame(id = 1:2, name = c("John", "Pat"), organisation = c("Amazon", "Apple"), stringsAsFactors = FALSE)
# Flag every user whose name appears in either df2$name or df2$organisation
df1 %>%
  mutate(reply = user.name %in% c(df2$name, df2$organisation))
Personally, I like the tidy solution because you can easily pipe the result onward to get more insights. For instance, if you want to know how many people replied, that takes just one more line:
df1 %>%
mutate(reply = user.name %in% c(df2$name, df2$organisation)) %>%
summarize(reply_sum = sum(reply))

How to select one value of a data.frame within a list column with R?

I have a data.frame that contains a list-type column. Each element of the list contains a 1x3 data.frame. I only want one value from each of these. This will flatten my data.frame so I can write it out to a csv.
How do I select one item from the nested data.frame (see the 2nd column)?
Here's the nested col. I'd provide the data but cannot flatten to write_csv.
result of dput:
structure(list(id = c("1386707", "1386700", "1386462", "1386340",
"1386246", "1386300"), fields.created = c("2020-05-07T02:09:27.000-0700",
"2020-05-07T01:20:11.000-0700", "2020-05-06T21:38:14.000-0700",
"2020-05-06T07:19:44.000-0700", "2020-05-06T06:11:43.000-0700",
"2020-05-06T02:26:44.000-0700"), fields.customfield_10303 = c(NA,
NA, 3, 3, NA, NA), fields.customfield_28100 = list(NULL, structure(list(
self = ".../rest/api/2/customFieldOption/76412",
value = "New Feature", id = "76412"), .Names = c("self",
"value", "id"), class = "data.frame", row.names = 1L), structure(list(
self = ".../rest/api/2/customFieldOption/76414",
value = "Technical Debt", id = "76414"), .Names = c("self",
"value", "id"), class = "data.frame", row.names = 1L), NULL,
structure(list(self = ".../rest/api/2/customFieldOption/76411",
value = "Maintenance", id = "76411"), .Names = c("self",
"value", "id"), class = "data.frame", row.names = 1L), structure(list(
self = ".../rest/api/2/customFieldOption/76412",
value = "New Feature", id = "76412"), .Names = c("self",
"value", "id"), class = "data.frame", row.names = 1L))), row.names = c(NA,
6L), class = "data.frame", .Names = c("id", "fields.created",
"fields.customfield_10303", "fields.customfield_28100"))
I found a way to do this.
First, instead of changing the data, I added a column with mutate. Then, directly selected the same column from all nested lists. Then, I converted the list column into a vector. Finally, I cleaned it up by removing the other columns.
It seems to work. I don't know yet how it will handle multiple rows within the nested df.
# For every element of the list column, take its second column and collapse it
# into a single comma-separated string (NULL elements become ""). vapply()
# guarantees a plain character vector with one entry per row, whereas chained
# sapply() calls can return a list or attach names depending on the input.
dat <- sample_dat %>%
  mutate(categories = vapply(nested_col, function(x) toString(x[[2]]), character(1))) %>%
  # Drop the original nested column so the result is flat.
  select(-nested_col)
Related
How to directly select the same column from all nested lists within a list?
r-convert list column into character vector where lists are characters
library(dplyr)
library(tidyr)
df <- tibble(Group=c("A","A","B","C","D","D"),
Batman=1:6,
Superman=c("red","blue","orange","red","blue","red"))
nested <- df %>%
nest(data=-Group)
unnested <- nested %>%
unnest(data)
Nesting and unnesting data with tidyr
library(purrr)
nested %>%
mutate(data=map(data,~select(.x,2))) %>%
unnest(data)
select with purrr, but lapply as you've done is fine, it's just for aesthetics ;)

Convert days to calendar dates within a data frame in R

I have a dataframe like
ID |TRTSDT| TRTEDT
101|17952 | 18037
102|17956 | 18041
How can I convert the day counts into Date format? Thank you.
Try
df1[-1] <- lapply(df1[-1], as.Date, origin='1970-01-01')
data
df1 <- structure(list(ID = 101:102, TRTSDT = c(17952L, 17956L),
TRTEDT = c(18037L,
18041L)), .Names = c("ID", "TRTSDT", "TRTEDT"), class = "data.frame",
row.names = c(NA, -2L))

text cleaning in R

I have a single column in R that looks like this:
Path Column
ag.1.4->ao.5.5->iv.9.12->ag.4.35
ao.11.234->iv.345.455.1.2->ag.9.531
I want to transform this into:
Path Column
ag->ao->iv->ag
ao->iv->ag
How can I do this?
Thank you
Here is my full dput from my data:
structure(list(Rank = c(10394749L, 36749879L), Count = c(1L,
1L), Percent = c(0.001011122, 0.001011122), Path = c("ao.legacy payment.not_completed->ao.legacy payment.not_completed->ao.legacy payment.completed",
"ao.legacy payment.not_completed->agent.payment.completed")), .Names = c("Rank",
"Count", "Percent", "Path"), class = "data.frame", row.names = c(NA,
-2L))
You could use gsub to match the . and numbers following the . (\\.[0-9]+) and replace it with ''.
df1$Path.Column <- gsub('\\.[0-9]+', '', df1$Path.Column)
df1
# Path.Column
#1 ag -> ao -> iv -> ag
#2 ao -> iv -> ag
Update
For the new dataset df2
gsub('\\.[^->]+(?=(->|\\b))', '', df2$Path, perl=TRUE)
#[1] "ao->ao->ao" "ao->agent"
and for the strings shown in the OP's post
str2 <- c('ag.1.4->ao.5.5->iv.9.12->ag.4.35',
'ao.11.234->iv.345.455.1.2->ag.9.531')
gsub('\\.[^->]+(?=(->|\\b))', '', str2, perl=TRUE)
#[1] "ag->ao->iv->ag" "ao->iv->ag"
data
df1 <- structure(list(Path.Column = c("ag.1 -> ao.5 -> iv.9 -> ag.4",
"ao.11 -> iv.345 -> ag.9")), .Names = "Path.Column",
class = "data.frame", row.names = c(NA, -2L))
df2 <- structure(list(Rank = c(10394749L, 36749879L), Count = c(1L,
1L), Percent = c(0.001011122, 0.001011122),
Path = c("ao.legacy payment.not_completed->ao.legacy payment.not_completed->ao.legacy payment.completed",
"ao.legacy payment.not_completed->agent.payment.completed")),
.Names = c("Rank", "Count", "Percent", "Path"), class = "data.frame",
row.names = c(NA, -2L))
It may be easier to split the strings on '->' and process the substrings separately:
# split the strings into parts
subStrings <- strsplit(df$Path,'->')
# remove everything after the **first** dot
subStrings<- lapply(subStrings,
function(x)gsub('\\..*','',x))
# paste them back together; vapply() always returns a character vector,
# whereas sapply() returns an empty list for empty input.
vapply(subStrings, paste0, character(1), collapse="->")
#> "ao->ao->ao" "ao->agent"
or
# split the strings into parts
subStrings <- strsplit(df$Path,'->')
# remove the part of each identifier from the dot up to the next space or tab
subStrings<- lapply(subStrings,
function(x)gsub('\\.[^ \t]*','',x))
# paste them back together; vapply() always returns a character vector,
# whereas sapply() returns an empty list for empty input.
vapply(subStrings, paste0, character(1), collapse="->")
#> "ao payment->ao payment->ao payment" "ao payment->agent"

Resources