Add a leading zero to specific observations - r

I have several IDs I am working with. I want to add a leading zero to values that have only one digit after the dash in id. Here is some sample data:
id
2034-5
1023-12
1042-22
1231-9
I want this:
id
2034-05
1023-12
1042-22
1231-09
I tried this, but it's not working. Any advice?
x <- sprintf("%02d", df$id)

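The sprintf call fails because %02d expects a number, while df$id is a character vector that still contains the dash, so R reports an invalid-format error instead of padding anything. For example:
sprintf("%02d", 5)            # "05" -- works on numeric input
# sprintf("%02d", "2034-5")   # errors: %d needs numeric, not character, input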
You could actually use sub here for a base R option:
df$id <- sub("-(\\d)$", "-0\\1", df$id)
df
id
1 2034-05
2 1023-12
3 1042-22
4 1231-09
Data:
df <- data.frame(id=c("2034-5", "1023-12", "1042-22", "1231-9"), stringsAsFactors=FALSE)

To use sprintf, you have to separate out the two numbers, apply sprintf to the second one, and then combine them again.
library(dplyr)
library(tidyr)
df %>%
  separate(id, c('id1', 'id2')) %>%
  mutate(id2 = sprintf('%02s', id2)) %>%
  unite(id, id1, id2, sep = '-')
# id
#1 2034-05
#2 1023-12
#3 1042-22
#4 1231-09

An option with strsplit and sprintf from base R
df$id <- sapply(strsplit(df$id, "-"), function(x)
  do.call(sprintf, c(as.list(x), fmt = "%s-%02s")))
df$id
#[1] "2034-05" "1023-12" "1042-22" "1231-09"

Related

R Subsetting text from a comma separated column in a data-frame

I have a data.frame with a column that looks like this:
diagnosis
F.31.2,A.43.2,R.45.2,F.43.1
I want to somehow split this column into two columns, one containing all the values with F and one for all the other values, resulting in a df that looks like this:
F other
F.31.2,F.43.1 A.43.2,R.45.2
Thanks in advance
Try the following tidyverse approach. You can separate the rows by ',', create a group according to the pattern, and then reshape to wide to obtain the expected result:
library(dplyr)
library(tidyr)
#Data
df <- data.frame(diagnosis='F.31.2,A.43.2,R.45.2,F.43.1',stringsAsFactors = F)
#Code
new <- df %>% separate_rows(diagnosis, sep = ',') %>%
  mutate(Group = ifelse(grepl('F', diagnosis), 'F', 'Other')) %>%
  pivot_wider(values_fn = toString, names_from = Group, values_from = diagnosis)
Output:
# A tibble: 1 x 2
F Other
<chr> <chr>
1 F.31.2, F.43.1 A.43.2, R.45.2
First, use strsplit to split at the commas. Then use grep to find the indices of the F entries, select/anti-select them by multiplying the indices by 1 or -1, and paste the pieces back together.
tmp <- el(strsplit(d$diagnosis, ","))
res <- lapply(c(1, -1), function(x) paste(tmp[grep("F", tmp)*x], collapse=","))
res <- setNames(as.data.frame(res), c("F", "other"))
res
# F other
# 1 F.31.2,F.43.1 A.43.2,R.45.2
Data:
d <- setNames(read.table(text="F.31.2,A.43.2,R.45.2,F.43.1"), "diagnosis")
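If diagnosis had more than one row, the same idea could be applied row by row; a minimal sketch along those lines (the split_diag helper name is made up, column names as above):
split_diag <- function(s) {
  tmp <- el(strsplit(s, ","))
  isF <- grepl("F", tmp)                     # which codes contain an F
  c(F = paste(tmp[isF], collapse = ","),
    other = paste(tmp[!isF], collapse = ","))
}
t(sapply(d$diagnosis, split_diag))           # one F/other pair per row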

Use REGEX in R to extract specific string in value as a new column?

I have a column that contains strings of characters/values that look like this:
Current
111111~24-JUL-17 10:43:36~6.14
Desired Output
24-JUL-17 10:43:36
I'm hoping to take everything between the '~' characters (so the date/time) and disregard everything else.
I have this code right now, but it only seems to capture part of it:
df$Last <- gsub(".+\\s(.+)$", "\\1", df$col1)
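The reason that pattern only captures part of the value is that the greedy .+ consumes everything up to the last space, so the capture group is left with just the time and the trailing number:
gsub(".+\\s(.+)$", "\\1", "111111~24-JUL-17 10:43:36~6.14")
# [1] "10:43:36~6.14" -- the date has already been swallowed by the greedy .+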
We can use tidyr's separate to get the result below:
library(dplyr)
library(tidyr)
df <- data.frame(c1 = c('111111~24-JUL-17 10:43:36~6.14','111111~24-JUL-21 10:34:36~6.14'))
df
c1
1 111111~24-JUL-17 10:43:36~6.14
2 111111~24-JUL-21 10:34:36~6.14
df %>% separate(col = c1, into = c('x','Date','y'), sep = '~') %>% select(2)
Date
1 24-JUL-17 10:43:36
2 24-JUL-21 10:34:36
Using stringr package:
library(dplyr)
library(stringr)
df %>% mutate(c1 = str_extract(c1, '(?<=~).*(?=~)'))
c1
1 24-JUL-17 10:43:36
2 24-JUL-21 10:34:36
We can use sub in base R
df$c1 <- sub(".*~([^~]+)~.*", "\\1", df$c1)
df$c1
#[1] "24-JUL-17 10:43:36" "24-JUL-21 10:34:36"
Data:
df <- data.frame(c1 = c('111111~24-JUL-17 10:43:36~6.14',
                        '111111~24-JUL-21 10:34:36~6.14'))

R: How to mutate new ID by modifying previous ID?

I asked the question (How to mutate a new column by modifying another column?) before.
Now I have another problem. I have to work with more 'untidy' IDs like:
df1 <- data.frame(id=c("A-1","A-10","A-100","b-1","b-10","b-100"),n=c(1,2,3,4,5,6))
From these IDs, I want to assign new 'tidy' IDs like:
df2 <- data.frame(id=c("A0001","A0010","A0100","B0001","B0010","B0100"),n=c(1,2,3,4,5,6))
(now I need capital 'B' instead of 'b')
I tried to use the str_pad function, but I couldn't manage it.
We can separate the data into different columns based on "-", convert the letters to uppercase, pad with 0's using sprintf, and combine the two columns with unite.
library(dplyr)
library(tidyr)
df1 %>%
  separate(id, c("id1", "id2"), sep = "-") %>%
  mutate(id1 = toupper(id1),
         id2 = sprintf('%04s', id2)) %>%
  unite(id, id1, id2, sep = "")
# id n
#1 A0001 1
#2 A0010 2
#3 A0100 3
#4 B0001 4
#5 B0010 5
#6 B0100 6
Based on the comment, if there are cases where we don't have a separator and we want to change certain id1 values, we can use the following.
df1 %>%
  extract(id, c("id1", "id2"), regex = "([:alpha:])-?(\\d+)") %>%
  mutate(id1 = case_when(id1 == 'c' ~ 'B',
                         TRUE ~ id1),
         id1 = toupper(id1),
         id2 = sprintf('%04s', id2)) %>%
  unite(id, id1, id2, sep = "")
The str_pad function is handy for this purpose, as you said. But you have to extract out the digits first and then paste it all back together.
library(stringr)
paste0(toupper(str_extract(df1$id, "[aA-zZ]-")),
       str_pad(str_extract(df1$id, "\\d+"), width=4, pad="0"))
[1] "A-0001" "A-0010" "A-0100" "B-0001" "B-0010" "B-0100"
Base R solution
df1$id <- sub("^(.)0+?(.{4})$","\\1\\2", sub("-", "0000", toupper(df1$id)))
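Stepping through the trick on one value for illustration: the inner sub swaps the dash for four zeros, and the outer sub then lazily drops just enough of those zeros to leave exactly four characters after the leading letter.
sub("-", "0000", toupper("b-100"))            # "B0000100"
sub("^(.)0+?(.{4})$", "\\1\\2", "B0000100")   # "B0100"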
tidyverse solution
library(tidyverse)
df1$id <- str_to_upper(df1$id) %>%
  str_replace("-", "0000") %>%
  str_replace("^(.)0+?(.{4})$", "\\1\\2")
Output
df1
# id n
# 1 A0001 1
# 2 A0010 2
# 3 A0100 3
# 4 B0001 4
# 5 B0010 5
# 6 B0100 6
Data
df1 <- data.frame(id=c("A-1","A-10","A-100","b-1","b-10","b-100"),n=c(1,2,3,4,5,6))

Sum by aggregating complex paired names in R

In R, I'm trying to aggregate a dataframe based on unique IDs, BUT I need to use some kind of wild card value for the IDs. Meaning I have paired names like this:
lion_tiger
elephant_lion
tiger_lion
And I need the lion_tiger and tiger_lion IDs to be summed together, because the order in the pair does not matter.
Using this dataframe as an example:
df <- data.frame(pair = c("1_3","2_4","2_2","1_2","2_1","4_2","3_1","4_3","3_2"),
                 value = c("12","10","19","2","34","29","13","3","14"))
So the values for the pair IDs "1_2" and "2_1" need to be summed in a new table. That new row would then read:
1_2 36
Any suggestions? While my example has numbers as the pair IDs, in reality I would need this to work on text (like the "lion_tiger" example above).
We can split the 'pair' column by _, sort each pair and paste it back together, and use that as the grouping variable to get the sum:
tapply(as.numeric(as.character(df$value)),
       sapply(strsplit(as.character(df$pair), '_'), function(x)
         paste(sort(as.numeric(x)), collapse = "_")), FUN = sum)
Or another option is gsubfn
library(gsubfn)
df$pair <- gsubfn('([0-9]+)_([0-9]+)', ~ paste(sort(as.numeric(c(x, y))), collapse = '_'),
                  as.character(df$pair))
df$value <- as.numeric(as.character(df$value))
aggregate(value~pair, df, sum)
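For the sample df above, both approaches group by the unordered pair; worked out by hand from the data, the sums should be:
# 1_2 1_3 2_2 2_3 2_4 3_4
#  36  25  19  14  39   3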
Using tidyverse and purrrlyr
df <- data.frame(name = c("lion_tiger", "elephant_lion", "tiger_lion"),
                 value = c(1, 2, 3), stringsAsFactors = FALSE)
require(tidyverse)
require(purrrlyr)
df %>% separate(col = name, sep = "_", c("A", "B")) %>%
  by_row(.collate = "rows",
         ..f = function(this_row) {
           paste0(sort(c(this_row$A, this_row$B)), collapse = "_")
         }) %>%
  rename(sorted = ".out") %>%
  group_by(sorted) %>%
  summarize(sum(value)) %>%
  show
## A tibble: 2 x 2
# sorted `sum(value)`
# <chr> <dbl>
#1 elephant_lion 2
#2 lion_tiger 4

Create a loop for pasting or removing elements based on different scenarios

Say I have the following data set:
mydf <- data.frame("MemberID" = c("111","0111A","0111B","112","0112A","113","0113B"),
                   "resign.date" = c("2013/01/01",NA,NA,"2014/03/01",NA,NA,NA))
Note: 111, 112 and 113 are the IDs of the family representatives.
I would like to do two things:
a) If I have the resign date for a family representative, for instance 111, I want to paste the same resign date for 0111A and 0111B (these represent the spouse and children of 111, if you're wondering).
b) If I don't have a resign date for the family representative, for instance 113, I would simply like to remove the rows 113 and 0113B.
My resulting data frame should look like this:
mydf <- data.frame("MemberID" = c("111","0111A","0111B","112","0112A"),
                   "resign.date" = c("2013/01/01","2013/01/01","2013/01/01","2014/03/01","2014/03/01"))
Thanks in advance.
If resign.date is only present for (some) MemberIDs without trailing letters, here is a solution using data.table:
library(data.table)
df <- data.table("MemberID" = c("0111","0111A","0111B","0112","0112A","0113","0113B"),
                 "resign.date" = c("2013/01/01",NA,NA,"2014/03/01",NA,NA,NA))
df <- df[order(MemberID)] ## order data: MemberIDs w/out trailing letters first by ID
df[, myID := gsub("\\D+", "", MemberID)] ## create myID col: MemberID w/out trailing letters
df[, my.resign.date := resign.date[1L], by = myID] ## assign first occurrence of resign date by myID
df <- df[!is.na(my.resign.date)] ## drop rows if my.resign.date is missing
EDIT
If there are inconsistencies in MemberID (some have a leading 0, some don't), you can try a workaround like the following.
df <- data.table("MemberID" = c("111","0111A","0111B","112","0112A","113","0113B"),
                 "resign.date" = c("2013/01/01",NA,NA,"2014/03/01",NA,NA,NA))
df[, myID := gsub("(?<![0-9])0+", "", gsub("\\D+", "", MemberID), perl = TRUE)] ## keep digits only, then strip leading zeros
df <- df[order(myID, -MemberID)] ## descending MemberID puts the representative first within each myID
df[ , my.resign.date := resign.date[1L], by = myID]
df <- df[!is.na(my.resign.date)]
We can also use the tidyverse:
library(tidyverse)
mydf %>%
  group_by(grp = parse_number(MemberID)) %>%
  mutate(resign.date = first(resign.date)) %>%
  na.omit() %>%
  ungroup() %>%
  select(-grp)
# A tibble: 5 x 2
# MemberID resign.date
# <fctr> <fctr>
#1 0111 2013/01/01
#2 0111A 2013/01/01
#3 0111B 2013/01/01
#4 0112 2014/03/01
#5 0112A 2014/03/01
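Here parse_number() (from readr, attached with the tidyverse) does the grouping work: it pulls the first number out of each ID, so trailing letters and leading zeros are ignored and the whole family ends up in one group:
library(readr)
parse_number(c("111", "0111A", "0111B"))
# [1] 111 111 111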
