If strings of two dataframes match print rowname - r

I want to print the rownames of a dataframe as vector if it matches a string of another dataframe.
I made a thesaurus with synonyms that looks like this
Synonym Synonym2 Synonym3
0010 01 beobachten U
0030 hkp <NA> <NA>
0040 hkp <NA> <NA>
0050 <NA> <NA> <NA>
0060 <NA> <NA> <NA>
0065 <NA> <NA> <NA>
0070 vipr perk <NA>
0080 oberfl anästh anästh <NA>
0090 vest inj vest inj inj sept blau
0100 l1 <NA> <NA>
0110 <NA> üz <NA>
0120 <NA> <NA> <NA>
1000 gezeigt zu achten putzdruck mhu
and I have a second dataframe:
PKV_clean
ID Aufzeichnungen
1 1 scharfkantig
2 1 t
3 1 aht 36 üz distal
4 1 seit paartagen
5 1 36 vipr
6 1 perk
7 1 üz bilfuird
8 1
9 1
10 1 knirscht
11 1 schiene empohlen
12 1 meldet
and it should print the rownames of the thesaurus when matching any string of a row (Synonym) with the second dataframe.
For example it should print (0070,0110) because Synonym "vipr" of 0070 and Synonym2 "üz" of 0110 matches with the second dataframe.
I tried it with:
#Sure things
a <- thesaurus[grep(PKV_clean$Aufzeichnungen,rownames(thesaurus))]
but this didn't work.
I have also tried creating a separate number for each case and create a 0|1 matrix. This works fine but with over 500 entries this is a lot of manual work.
PKV$"0070" = 0
PKV$"0070"[grepl("vipr | perk",PKV$Aufzeichnungen)] = 1
PKV$"0110" = 0
PKV$"0110"[grepl("üz",PKV$Aufzeichnungen)] = 1
output <- PKV|>
select(where(is.numeric)) |>
select(where(~ sum(.) > 0)) |>
names()
Thank you for your help!
#Mohammed Desouky
if I use the hole Dictionary with your code I get as output:
Edit 2:
dput(head(thesaurus , 40))
structure(list(Synonym = c("01", "hkp", "hkp", NA, NA, NA, "vipr",
"oberfl anästh", "vest inj", "l1", NA, NA, "gezeigt zu achten",
NA, NA, NA, "pzr", NA, "duraphat aufgetragen", NA, "bmf blutstillung",
NA, "fllg", "f1", "f2", "f2 sät", "f3", "mdv", "f4", NA, NA,
NA, NA, NA, NA, "st", NA, "metallprimer oberfläche", NA, NA),
Synonym2 = c("beobachten", NA, NA, NA, NA, NA, "perk", "anästh",
"vest inj", NA, NA, NA, "putzdruck", NA, NA, NA, "psi sbi api",
NA, "empfindliche stelle aufgetragen", NA, "cp", NA, "fllng",
NA, "fllng 2", "tetric flow", "fllg 3", NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, "monobond plus behandelt", NA, NA), Synonym3 = c("U",
NA, NA, NA, NA, NA, NA, NA, "inj sept blau", NA, NA, NA,
"mhu", NA, NA, NA, "dentalhygiene", NA, "üz", NA, "blutstillung",
NA, "f1", NA, "fllg 2", NA, "fllng 3", NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), Synonym4 = c(NA, NA, NA,
NA, NA, NA, NA, NA, "cx pulpennah", NA, NA, NA, "gezeigt",
NA, NA, NA, NA, NA, "duraphat", NA, "bmf", NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), Synonym5 = c(NA, NA, NA, NA, NA, NA, NA, NA, "infiltration",
NA, NA, NA, "pat gezeigt", NA, NA, NA, NA, NA, NA, NA, "visco gel",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), Synonym6 = c(NA, NA, NA, NA, NA, NA, NA,
NA, "injektion", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), Synonym7 = c(NA, NA, NA, NA, NA,
NA, NA, NA, "infil", NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA), Synonym8 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), Synonym9 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_)), row.names = c("0010",
"0030", "0040", "0050", "0060", "0065", "0070", "0080", "0090",
"0100", "0110", "0120", "1000", "1010", "1020", "1030", "1040",
"2000", "2010", "2020", "2030", "2040", "2050", "2060", "2070",
"2080", "2090", "2100", "2110", "2120", "2130", "2150", "2160",
"2170", "2180", "2190", "2195", "2197", "2200", "2210"), class = "data.frame")```
Can you work with this?

Try this
Synonym <- c()
for(i in 1:nrow(PKV_clean)){
s <- gsub("\\s+" , "|" , PKV_clean$Aufzeichnungen[i])
if(nchar(s) > 1){
ind <- rowSums(sapply(thesaurus , \(x) grepl(s , x)
))
} else ind <- rep(0 , nrow(thesaurus))
a <- which(ind >= 1)
if(length(a) > 0) ans <- toString(rownames(thesaurus)[a]) else ans <- ""
Synonym <- c(Synonym , ans)
}
PKV_clean$Synonym <- Synonym
output
ID Aufzeichnungen Synonym
1 1 scharfkantig
2 1 t
3 1 aht 36 üz distal 0110
4 1 seit paartagen
5 1 36 vipr 0070
6 1 perk 0070
7 1 üz bilfuird 0110
8 1
9 1
10 1 knirscht
11 1 schiene empohlen
12 1 meldet

Related

Collapsing Dataframe Rows along several variables

I have a dataframe that looks something like this, in which I have several rows for each user, and many NAs in the columns.
user
Effect T1
Effect T2
Effect T3
Benchmark T1
Benchmark T2
Benchmark T3
Tom
01
NA
NA
02
NA
NA
Tom
NA
07
NA
NA
08
NA
Tom
NA
NA
13
NA
NA
14
Larry
03
NA
NA
04
NA
NA
Larry
NA
09
NA
NA
10
NA
Larry
NA
NA
15
NA
NA
16
Dave
05
NA
NA
06
NA
NA
Dave
NA
11
NA
NA
12
NA
Dave
NA
NA
17
NA
NA
18
I want to collapse the columns using the name and filling the values from reach row, this this.
user
Effect T1
Effect T2
Effect T3
Benchmark T1
Benchmark T2
Benchmark T3
Tom
01
07
13
02
08
14
Larry
03
09
15
04
10
16
Dave
05
11
17
06
12
18
How might I accomplish this?
Thank you in advance for your help. Update: I've added the dput of a subset of the actual data below.
structure(list(name = c("Abraham_Ralph", "Abraham_Ralph", "Abraham_Ralph",
"Ackerman_Gary", "Adams_Alma", "Adams_Alma", "Adams_Alma", "Adams_Alma",
"Adams_Sandy", "Aderholt_Robert", "Aderholt_Robert", "Aderholt_Robert",
"Aderholt_Robert", "Aderholt_Robert", "Aguilar_Pete", "Aguilar_Pete",
"Aguilar_Pete"), state = c("LA", "LA", "LA", "NY", "NC", "NC",
"NC", "NC", "FL", "AL", "AL", "AL", "AL", "AL", "CA", "CA", "CA"
), seniority = c(1, 2, 3, 15, 1, 2, 3, 4, 1, 8, 9, 10, 11, 12,
1, 2, 3), legeffect_112 = c(NA, NA, NA, 0.202061712741852, NA,
NA, NA, NA, 1.30758035182953, 3.73544979095459, NA, NA, NA, NA,
NA, NA, NA), legeffect_113 = c(NA, NA, NA, NA, 0, NA, NA, NA,
NA, NA, 0.908495426177979, NA, NA, NA, NA, NA, NA), legeffect_114 = c(2.07501077651978,
NA, NA, NA, NA, 0.84164834022522, NA, NA, NA, NA, NA, 0.340001106262207,
NA, NA, 0.10985741019249, NA, NA), legeffect_115 = c(NA, 0.493490308523178,
NA, NA, NA, NA, 0.587624311447144, NA, NA, NA, NA, NA, 0.159877583384514,
NA, NA, 0.730929613113403, NA), legeffect_116 = c(NA, NA, 0.0397605448961258,
NA, NA, NA, NA, 1.78378939628601, NA, NA, NA, NA, NA, 0.0198802724480629,
NA, NA, 0.0497006773948669), benchmark_112 = c(NA, NA, NA, 0.738679468631744,
NA, NA, NA, NA, 0.82908970117569, 1.39835929870605, NA, NA, NA,
NA, NA, NA, NA), benchmark_113 = c(NA, NA, NA, NA, 0.391001850366592,
NA, NA, NA, NA, NA, 1.58223271369934, NA, NA, NA, NA, NA, NA),
benchmark_114 = c(1.40446054935455, NA, NA, NA, NA, 0.576326191425323,
NA, NA, NA, NA, NA, 1.42212760448456, NA, NA, 0.574363172054291,
NA, NA), benchmark_115 = c(NA, 1.3291300535202, NA, NA, NA,
NA, 0.537361204624176, NA, NA, NA, NA, NA, 1.45703768730164,
NA, NA, 0.523149251937866, NA), benchmark_116 = c(NA, NA,
0.483340591192245, NA, NA, NA, NA, 1.31058621406555, NA,
NA, NA, NA, NA, 0.751261711120605, NA, NA, 1.05683290958405
)), row.names = c(NA, -17L), class = c("tbl_df", "tbl", "data.frame"
))
A data.table solution:
# melt data, remove NA, then recast ...
dt <- dcast(melt(data.table(d), "name")[!value %in% NA], name ~ variable)
dcast(melt(data.table(d), "name")[!value %in% c(NA) & !variable %in% c("variable", "seniority", "state")], name ~ variable)
name legeffect_112 legeffect_113 legeffect_114 legeffect_115 legeffect_116 benchmark_112 benchmark_113 benchmark_114 benchmark_115 benchmark_116
1: Abraham_Ralph <NA> <NA> 2.07501077651978 0.493490308523178 0.0397605448961258 <NA> <NA> 1.40446054935455 1.3291300535202 0.483340591192245
2: Ackerman_Gary 0.202061712741852 <NA> <NA> <NA> <NA> 0.738679468631744 <NA> <NA> <NA> <NA>
3: Adams_Alma <NA> 0 0.84164834022522 0.587624311447144 1.78378939628601 <NA> 0.391001850366592 0.576326191425323 0.537361204624176 1.31058621406555
4: Adams_Sandy 1.30758035182953 <NA> <NA> <NA> <NA> 0.82908970117569 <NA> <NA> <NA> <NA>
5: Aderholt_Robert 3.73544979095459 0.908495426177979 0.340001106262207 0.159877583384514 0.0198802724480629 1.39835929870605 1.58223271369934 1.42212760448456 1.45703768730164 0.751261711120605
6: Aguilar_Pete <NA> <NA> 0.10985741019249 0.730929613113403 0.0497006773948669 <NA> <NA> 0.574363172054291 0.523149251937866 1.05683290958405
Data/Setup
# Load data.table
# install.packages("data.table")
library(data.table)
# Read example data
d <- structure(list(name = c("Abraham_Ralph", "Abraham_Ralph", "Abraham_Ralph",
"Ackerman_Gary", "Adams_Alma", "Adams_Alma", "Adams_Alma", "Adams_Alma",
"Adams_Sandy", "Aderholt_Robert", "Aderholt_Robert", "Aderholt_Robert",
"Aderholt_Robert", "Aderholt_Robert", "Aguilar_Pete", "Aguilar_Pete",
"Aguilar_Pete"), state = c("LA", "LA", "LA", "NY", "NC", "NC",
"NC", "NC", "FL", "AL", "AL", "AL", "AL", "AL", "CA", "CA", "CA"
), seniority = c(1, 2, 3, 15, 1, 2, 3, 4, 1, 8, 9, 10, 11, 12,
1, 2, 3), legeffect_112 = c(NA, NA, NA, 0.202061712741852, NA,
NA, NA, NA, 1.30758035182953, 3.73544979095459, NA, NA, NA, NA,
NA, NA, NA), legeffect_113 = c(NA, NA, NA, NA, 0, NA, NA, NA,
NA, NA, 0.908495426177979, NA, NA, NA, NA, NA, NA), legeffect_114 = c(2.07501077651978,
NA, NA, NA, NA, 0.84164834022522, NA, NA, NA, NA, NA, 0.340001106262207,
NA, NA, 0.10985741019249, NA, NA), legeffect_115 = c(NA, 0.493490308523178,
NA, NA, NA, NA, 0.587624311447144, NA, NA, NA, NA, NA, 0.159877583384514,
NA, NA, 0.730929613113403, NA), legeffect_116 = c(NA, NA, 0.0397605448961258,
NA, NA, NA, NA, 1.78378939628601, NA, NA, NA, NA, NA, 0.0198802724480629,
NA, NA, 0.0497006773948669), benchmark_112 = c(NA, NA, NA, 0.738679468631744,
NA, NA, NA, NA, 0.82908970117569, 1.39835929870605, NA, NA, NA,
NA, NA, NA, NA), benchmark_113 = c(NA, NA, NA, NA, 0.391001850366592,
NA, NA, NA, NA, NA, 1.58223271369934, NA, NA, NA, NA, NA, NA),
benchmark_114 = c(1.40446054935455, NA, NA, NA, NA, 0.576326191425323,
NA, NA, NA, NA, NA, 1.42212760448456, NA, NA, 0.574363172054291,
NA, NA), benchmark_115 = c(NA, 1.3291300535202, NA, NA, NA,
NA, 0.537361204624176, NA, NA, NA, NA, NA, 1.45703768730164,
NA, NA, 0.523149251937866, NA), benchmark_116 = c(NA, NA,
0.483340591192245, NA, NA, NA, NA, 1.31058621406555, NA,
NA, NA, NA, NA, 0.751261711120605, NA, NA, 1.05683290958405
)), row.names = c(NA, -17L), class = c("tbl_df", "tbl", "data.frame"
))
This solution is using only the base functions (no extra packages), but the one-liner may cause eyes to cross, so I'll split it into several functions.
The plan is the following:
Split the original data.frame by the values in name column, using the function by;
For each partition of the data.frame, collapse the columns;
A collapsed column returns the max value of the column, or NA if all its values are NA;
The collapsed data.frame partitions are stacked together.
So, this is a function that does that:
dfr_collapse <- function(dfr, col0)
{
# Collapse the columns of the data.frame "dfr" grouped by the values of
# the column "col0"
# Max/NA function
namax <- function(x)
{
if(all(is.na(x)))
NA # !!!
else
max(x, na.rm=TRUE)
}
# Column collapse function
byfun <- function(x)
{
lapply(x, namax)
}
# Stack the partitioning results
return(do.call(
what = rbind,
args = by(dfr, dfr[[col0]], byfun)
))
}
May not look as slick as a one-liner, but it does the job. It can be tunrned into a one-liner, but you don't want that.
Assuming that df0 is the data.frame from you dput, you can test this function with
dfr_collapse(df0)
Nota bene: for the sake of simplicity, I return an NA of type logical (see the comment # !!! above). The correct code should convert that NA to the mode of the x vector. Also, the function should check the type of its inputs, etc.

Missing cases while using summarise(across())

I have data.frame that looks like this:
I want to quickly reshape it so I will only one record for each ID, something that is looks like this:
df can be build using codes:
df<-structure(list(ID = structure(c("05-102", "05-102", "05-102",
"01-103", "01-103", "01-103", "08-104", "08-104", "08-104", "05-105",
"05-105", "05-105", "02-106", "02-106", "02-106", "05-107", "05-107",
"05-107", "08-108", "08-108", "08-108", "02-109", "02-109", "02-109",
"05-111", "05-111", "05-111", "07-115", "07-115", "07-115"), label = "Unique Subject Identifier", format.sas = "$"),
EXSTDTC1 = structure(c(NA, NA, NA, 17022, NA, NA, 17024,
NA, NA, 17032, NA, NA, 17038, NA, NA, 17092, NA, NA, 17108,
NA, NA, 17155, NA, NA, 17247, NA, NA, 17333, NA, NA), class = "Date"),
EXSTDTC6 = structure(c(NA, 16885, NA, NA, NA, 17031, NA,
NA, 17032, NA, NA, 17041, NA, NA, 17047, NA, NA, 17100, NA,
NA, 17116, NA, 17164, NA, NA, NA, 17256, NA, 17342, NA), class = "Date"),
EXSTDTC3 = structure(c(NA, NA, 16881, NA, 17027, NA, NA,
17029, NA, NA, 17037, NA, NA, 17043, NA, NA, 17097, NA, NA,
17113, NA, NA, NA, 17160, NA, 17252, NA, NA, NA, 17338), class = "Date"),
EXDOSEA1 = c("73.8+147.6", NA, NA, "64.5+129", NA, NA, "62.7+125.4",
NA, NA, "114+57", NA, NA, "60+117.5", NA, NA, "48.6+97.2",
NA, NA, "61.2+122.4", NA, NA, "47.7+95.4", NA, NA, "51.6+103.2",
NA, NA, "68+136", NA, NA), EXDOSEA6 = c(NA, "100", NA, NA,
NA, "86", NA, NA, "83.5", NA, NA, "76", NA, NA, "39.2", NA,
NA, "32", NA, NA, "81.5", NA, "69.6", NA, NA, NA, "68", NA,
"91", NA), EXDOSEA3 = c(NA, NA, "1600", NA, "4302", NA, NA,
"4185", NA, NA, "3900", NA, NA, "3921", NA, NA, "3300", NA,
NA, "4080", NA, NA, NA, "3183", NA, "3300", NA, NA, NA, "1514"
)), row.names = c(NA, -30L), class = c("tbl_df", "tbl", "data.frame"
))
right now I have my codes as:
df %>%
group_by(ID) %>%
summarise(across(EXSTDTC1:EXDOSEA3, na.omit))
But it seems remove the 05-102 as it did not have value on EXSTDTC1. I would like to see how we can address this. Is it possible to keep across still?
Many thanks.
We could use an if/else condition to address those cases where there is only NA
library(dplyr)
df %>%
group_by(ID) %>%
summarise(across(EXSTDTC1:EXDOSEA3,
~ if(all(is.na(.))) NA else .[complete.cases(.)]), .groups = 'drop')
-output
# A tibble: 10 x 7
# ID EXSTDTC1 EXSTDTC6 EXSTDTC3 EXDOSEA1 EXDOSEA6 EXDOSEA3
# <chr> <date> <date> <date> <chr> <chr> <chr>
# 1 01-103 2016-08-09 2016-08-18 2016-08-14 64.5+129 86 4302
# 2 02-106 2016-08-25 2016-09-03 2016-08-30 60+117.5 39.2 3921
# 3 02-109 2016-12-20 2016-12-29 2016-12-25 47.7+95.4 69.6 3183
# 4 05-102 NA 2016-03-25 2016-03-21 73.8+147.6 100 1600
# 5 05-105 2016-08-19 2016-08-28 2016-08-24 114+57 76 3900
# 6 05-107 2016-10-18 2016-10-26 2016-10-23 48.6+97.2 32 3300
# 7 05-111 2017-03-22 2017-03-31 2017-03-27 51.6+103.2 68 3300
# 8 07-115 2017-06-16 2017-06-25 2017-06-21 68+136 91 1514
# 9 08-104 2016-08-11 2016-08-19 2016-08-16 62.7+125.4 83.5 4185
#10 08-108 2016-11-03 2016-11-11 2016-11-08 61.2+122.4 81.5 4080

R reshape name value pairs from wide to long using pivot_longer

I am trying to figure out how to reshape a dataset of the names of political parties from wide to long using dplyr and pivot_longer.
For each Party_ID, there is a number of constant columns attached (Party_Name_Short, Party_Name, Country, Party_in_orig_title) and a number of time changing factors as well: election, Date, Rename, Reason, Party_Title, alliance, member_parties, split, parent_party, merger, child_party, successor, predecessor. The time changing factors were recorded up to 11 times for each party, as reflected by the index in the colname.
In order to provide a sample I selected the first three time changing columns for each party and a sample of 5 random rows:
structure(list(Party_Name_Short = c("LZJ-PS", "ZiZi", "MNR",
"MDP", "E200"), Party_Name = c("Lista Zorana Jankovica – Pozitivna Slovenija",
"Živi zid", "Mouvement national républicain", "Movimento Democrático Português",
"Erakond Eesti 200"), Country = c("SVN", "HRV", "FRA", "PRT",
"EST"), Party_ID = c(1987, 2612, 1263, 1281, 2720), Party_in_orig_title = c(0,
0, 0, 0, 0), Date1 = c(2011, NA, 1999, 1987, NA), Rename1 = c("Lista Zorana Jankovica – Pozitivna Slovenija",
NA, "Mouvement national républicain", "ID", NA), Reason1 = c("foundation",
NA, "split from FN", "split", NA), Party_Title1 = c(0, NA, 0,
0, NA), alliance1 = c(0, NA, 0, 0, NA), member_parties1 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_),
split1 = c(0, NA, 1, 1, NA), parent_party1 = c(NA, NA, "FN",
"MDP", NA), merger1 = c(0, NA, 0, 0, NA), child_party1 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
), successor1 = c(0, NA, 0, 0, NA), predecessor1 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
), Date2 = c(2012, NA, NA, NA, NA), Rename2 = c("Pozitivna Slovenija",
NA, NA, NA, NA), Reason2 = c("renamed", NA, NA, NA, NA),
Party_Title2 = c(0, NA, NA, NA, NA), alliance2 = c(0, NA,
NA, NA, NA), member_parties2 = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), split2 = c(0,
NA, NA, NA, NA), parent_party2 = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), merger2 = c(0,
NA, NA, NA, NA), child_party2 = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), successor2 = c(0,
NA, NA, NA, NA), predecessor2 = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), Date3 = c(2014,
NA, NA, NA, NA), Rename3 = c("ZaAB", NA, NA, NA, NA), Reason3 = c("split",
NA, NA, NA, NA), Party_Title3 = c(0, NA, NA, NA, NA), alliance3 = c(0,
NA, NA, NA, NA), member_parties3 = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), split3 = c(1,
NA, NA, NA, NA), parent_party3 = c("LZJ-PS", NA, NA, NA,
NA), merger3 = c(0, NA, NA, NA, NA), child_party3 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
), successor3 = c(0, NA, NA, NA, NA), predecessor3 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
), election1 = structure(c(15309, 16740, 11839, 6390, 17956
), class = "Date"), election2 = structure(c(16252, NA, NA,
NA, NA), class = "Date"), election3 = structure(c(16344,
NA, NA, NA, NA), class = "Date")), row.names = c(NA, -5L), class = c("tbl_df",
"tbl", "data.frame"))
I would like the data to follow a "long" structure where each party_id and the constant factors are repeated 11 times and there are single columns for the time changing factors. Following the top-rated answer formulated here I tried different variations of the following command:
pivot_longer(cols = starts_with(c("election", "Date", "Rename", "Reason", "Party_Title",
"alliance", "member_parties", "split", "parent_party",
"merger", "child_party", "successor", "predecessor")),
names_to = c(".value", "election", "Date", "Rename", "Reason", "Party_Title",
"alliance", "member_parties", "split", "parent_party",
"merger", "child_party", "successor", "predecessor"), names_sep = "_") %>%
select(-matches("election[1-9]"), -matches("Date[1-9]"), -matches("Rename[1-9]"),
-matches("Reason[1-9]"), -matches("alliance[1-9]"), -matches("member_parties[1-9]"),
-matches("split[1-9]"), -matches("parent_party[1-9]"), -matches("merger[1-9]"),
-matches("child_party[1-9]"), -matches("successor[1-9]"), -matches("predecessor[1-9]"),
-matches("Party_Title[1-9]"), -matches("election1[0-2]"), -matches("Date1[0-2]"), -matches("Rename1[0-2]"),
-matches("Reason1[0-2]"), -matches("alliance1[0-2]"), -matches("member_parties1[0-2]"),
-matches("split1[0-2]"), -matches("parent_party1[0-2]"), -matches("merger1[0-2]"),
-matches("child_party1[0-2]"), -matches("successor1[0-2]"), -matches("predecessor1[0-2]"),
-matches("Party_Title1[0-2]"))
However, for some reason, I get a lot of missing values and do not achieve the shape of the data I would like to have. I'd appreciate any hint if you have an idea of how to do this. Thanks!
Update:
I would like the final output to look something like:
structure(list(Party_Name_Short = c("LZJ-PS", "ZiZi", "MNR",
"MDP", "E200", "LZJ-PS", "ZiZi", "MNR", "MDP", "E200", "LZJ-PS",
"ZiZi", "MNR", "MDP", "E200"), Party_Name = c("Lista Zorana Jankovica – Pozitivna Slovenija",
"Živi zid", "Mouvement national républicain", "Movimento Democrático Português",
"Erakond Eesti 200", "Lista Zorana Jankovica – Pozitivna Slovenija",
"Živi zid", "Mouvement national républicain", "Movimento Democrático Português",
"Erakond Eesti 200", "Lista Zorana Jankovica – Pozitivna Slovenija",
"Živi zid", "Mouvement national républicain", "Movimento Democrático Português",
"Erakond Eesti 200"), Country = c("SVN", "HRV", "FRA", "PRT",
"EST", "SVN", "HRV", "FRA", "PRT", "EST", "SVN", "HRV", "FRA",
"PRT", "EST"), Party_ID = c(1987, 2612, 1263, 1281, 2720, 1987,
2612, 1263, 1281, 2720, 1987, 2612, 1263, 1281, 2720), Party_in_orig_title = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), time = c(1, 1, 1,
1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3), Date = c(2011, NA, 1999,
1987, NA, 2012, NA, NA, NA, NA, 2014, NA, NA, NA, NA), Rename = c("Lista Zorana Jankovica – Pozitivna Slovenija",
NA, "Mouvement national républicain", "ID", NA, "Pozitivna Slovenija",
NA, NA, NA, NA, "ZaAB", NA, NA, NA, NA), Reason = c("foundation",
NA, "split from FN", "split", NA, "renamed", NA, NA, NA, NA,
"split", NA, NA, NA, NA), Party_Title = c(0, NA, 0, 0, NA, 0,
NA, NA, NA, NA, 0, NA, NA, NA, NA), alliance = c(0, NA, 0, 0,
NA, 0, NA, NA, NA, NA, 0, NA, NA, NA, NA), member_parties = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), split = c(0,
NA, 1, 1, NA, 0, NA, NA, NA, NA, 1, NA, NA, NA, NA), parent_party = c(NA,
NA, "FN", "MDP", NA, NA, NA, NA, NA, NA, "LZJ-PS", NA, NA, NA,
NA), merger = c(0, NA, 0, 0, NA, 0, NA, NA, NA, NA, 0, NA, NA,
NA, NA), child_party = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), successor = c(0, NA, 0, 0, NA, 0, NA,
NA, NA, NA, 0, NA, NA, NA, NA), predecessor = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), election = structure(c(1322697600,
1446336000, 1022889600, 552096000, 1551398400, 1404172800, NA,
NA, NA, NA, 1412121600, NA, NA, NA, NA), class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -15L), class = c("tbl_df",
"tbl", "data.frame"))
Notice: the newly added time column and notice that this is only for example purposes, with three time changing factors, whereas in fact there are 11 in the data.
Using pivot_longer with names_sep to split between a non-digit and a digit at the end of the string
library(tidyr)
library(dplyr)
df1 %>%
pivot_longer(cols = matches('\\d+$'), names_to = c(".value", 'time'),
names_sep="(?<=\\D)(?=\\d+$)") %>%
arrange(time)
# A tibble: 15 x 19
# Party_Name_Short Party_Name Country Party_ID Party_in_orig_t… time Date Rename Reason Party_Title alliance member_parties split
# <chr> <chr> <chr> <dbl> <dbl> <chr> <dbl> <chr> <chr> <dbl> <dbl> <chr> <dbl>
# 1 LZJ-PS Lista Zor… SVN 1987 0 1 2011 Lista… found… 0 0 <NA> 0
# 2 ZiZi Živi zid HRV 2612 0 1 NA <NA> <NA> NA NA <NA> NA
# 3 MNR Mouvement… FRA 1263 0 1 1999 Mouve… split… 0 0 <NA> 1
# 4 MDP Movimento… PRT 1281 0 1 1987 ID split 0 0 <NA> 1
# 5 E200 Erakond E… EST 2720 0 1 NA <NA> <NA> NA NA <NA> NA
# 6 LZJ-PS Lista Zor… SVN 1987 0 2 2012 Pozit… renam… 0 0 <NA> 0
# 7 ZiZi Živi zid HRV 2612 0 2 NA <NA> <NA> NA NA <NA> NA
# 8 MNR Mouvement… FRA 1263 0 2 NA <NA> <NA> NA NA <NA> NA
# 9 MDP Movimento… PRT 1281 0 2 NA <NA> <NA> NA NA <NA> NA
#10 E200 Erakond E… EST 2720 0 2 NA <NA> <NA> NA NA <NA> NA
#11 LZJ-PS Lista Zor… SVN 1987 0 3 2014 ZaAB split 0 0 <NA> 1
#12 ZiZi Živi zid HRV 2612 0 3 NA <NA> <NA> NA NA <NA> NA
#13 MNR Mouvement… FRA 1263 0 3 NA <NA> <NA> NA NA <NA> NA
#14 MDP Movimento… PRT 1281 0 3 NA <NA> <NA> NA NA <NA> NA
#15 E200 Erakond E… EST 2720 0 3 NA <NA> <NA> NA NA <NA> NA
# … with 6 more variables: parent_party <chr>, merger <dbl>, child_party <chr>, successor <dbl>, predecessor <chr>, election <date>

R: merge columns from same data.frame based on NA positions

I have a dataframe like this:
df <- data.frame(theme1=c("hello",NA,NA,NA), theme2=c(NA,"world",NA,NA), theme3=c(NA,NA,"good_morning",NA), theme4=c(NA,NA,NA,"good_evening"))
theme1 theme2 theme3 theme4
1 hello NA NA NA
2 NA world NA NA
3 NA NA good_morning NA
4 NA NA NA good_evening
Now i want to obtain one column with preserving the row order:
**Theme_merged**
hello
world
good_morning
good_evening
Tries:
merge_themes <- data.frame(cbind(mycol = na.omit(unlist(data2_tst[18:23]))), stringsAsFactors = F)
The above code works but does not preserve the row order so when i want to place the vector back to the original dataframe it does not match anymore.
Real data:
dput(head(data2_tst[18:23], n = 50))
structure(list(Theme1 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, "%Bedrukken%", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, "%Bedrukken%", NA, NA, NA, NA, NA, NA, NA, NA), Theme2 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "%Nieuwste|Nieuwe|201[6:7]%",
"%Nieuwste|Nieuwe|201[6:7]%", "%Nieuwste|Nieuwe|201[6:7]%", NA,
NA, NA, NA, NA, "%Nieuwste|Nieuwe|201[6:7]%", "%Nieuwste|Nieuwe|201[6:7]%",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "%Nieuwste|Nieuwe|201[6:7]%",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "%Nieuwste|Nieuwe|201[6:7]%",
"%Nieuwste|Nieuwe|201[6:7]%"), Theme3 = c("%Nodig%", NA, "%Nodig%",
"%Nodig%", "%Nodig%", NA, NA, "%Nodig%", NA, "%Nodig%", NA, NA,
NA, NA, "%Nodig%", "%Nodig%", "%Nodig%", NA, NA, NA, NA, NA,
NA, "%Nodig%", "%Nodig%", NA, NA, "%Nodig%", NA, "%Nodig%", "%Nodig%",
"%Nodig%", NA, "%Nodig%", "%Nodig%", "%Nodig%", NA, NA, NA, "%Nodig%",
"%Nodig%", NA, "%Nodig%", NA, "%Nodig%", "%Nodig%", NA, "%Nodig%",
NA, NA), Theme4 = c(NA, "%Kopen%", NA, NA, NA, "%Kopen%", "%Kopen%",
NA, "%Kopen%", NA, NA, NA, NA, NA, NA, NA, NA, "%Kopen%", "%Kopen%",
NA, NA, "%Kopen%", "%Kopen%", NA, NA, "%Kopen%", "%Kopen%", NA,
"%Kopen%", NA, NA, NA, NA, NA, NA, NA, "%Kopen%", "%Kopen%",
"%Kopen%", NA, NA, NA, NA, "%Kopen%", NA, NA, "%Kopen%", NA,
NA, NA), Theme5 = c(NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_), Theme6 = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_)), .Names = c("Theme1",
"Theme2", "Theme3", "Theme4", "Theme5", "Theme6"), row.names = 3:52, class = "data.frame")
Version 0.5.0 of dplyr introduced the coalesce function:
This version of dplyr gains a number of vector functions inspired by SQL. Two functions make it a little easier to eliminate or generate missing values:
Given a set of vectors, coalesce() finds the first non-missing value in each position.
To apply this to the sample data frame you can use:
df <- mutate_all(df, .funs = as.character)
df$merged <- with(df, coalesce(theme1, theme2, theme3, theme4))
I found it necessary to convert from factors to character to avoid an 'invalid factor levels' error.
On your real data no conversion is necessary:
df$merged <- with(df, coalesce(Theme1, Theme2, Theme3, Theme4, Theme5, Theme6)
In SQL this would be the COALESCE function:
apply(df, 1, function(r) c(na.omit(r), NA)[1])
# [1] "hello" "world" "good_morning" "good_evening"
df <- data.frame(
theme1=c("hello",NA,NA,NA),
theme2=c(NA,"world",NA,NA),
theme3=c(NA,NA,"good_morning",NA),
theme4=c(NA,NA,NA,"good_evening"),
stringsAsFactors = FALSE
)
On your example data na.omit(unlist(df2, use.names = FALSE)) will work fine, but it will fail if there is a row of only NA values:
df2 <- data.frame(
theme1=c("hello",NA,NA,NA,NA),
theme2=c(NA,"world",NA,NA,NA),
theme3=c(NA,NA,"good_morning",NA,NA),
theme4=c(NA,NA,NA,"good_evening",NA),
theme5=c(NA_character_,NA_character_,NA_character_,
NA_character_,NA_character_),
stringsAsFactors = FALSE
)
df2$X <- na.omit(unlist(df2, use.names = FALSE))
# Error in `$<-.data.frame`(`*tmp*`, "X", value = c("hello", "world", "good_morning", :
# replacement has 4 rows, data has 5
df2$X <- apply(df2, 1, function(r) c(na.omit(r), NA)[1])
# theme1 theme2 theme3 theme4 theme5 X
# 1 hello <NA> <NA> <NA> <NA> hello
# 2 <NA> world <NA> <NA> <NA> world
# 3 <NA> <NA> good_morning <NA> <NA> good_morning
# 4 <NA> <NA> <NA> good_evening <NA> good_evening
# 5 <NA> <NA> <NA> <NA> <NA> <NA>
Another option could be df2$X <- df2[cbind(1:nrow(df2), max.col(!is.na(df2)))]
Here's a tidyverse solution (uses dplyr and tidyr or just tidyverse)
library(tidyverse)
> df <- df %>%
gather("theme", "theme_merged", 1:4) %>%
filter(!is.na(theme_merged)) %>%
select(theme_merged)
> df
theme_merged
1 hello
2 world
3 good_morning
4 good_evening
This should work with your data:
new_df = c(as.matrix(df))
This line first converts the df to a matrix and binds all the columns in one vector with c().
new_df <- new_df[!is.na(new_df)]
And now we keep only the non-NA entries. If you want you can convert it back to a dataframe:
new_df <- data.frame(new_df);names(new_df) <- "Themes"

A series of ifelse statements incorrectly defaults to the last option with real not simulated data

I am trying to write a function that will uses values from variables stored in different columns to generate a new variable. The logic requires a series of ifelse statements. However, the final statement is always evaluating to true and I don't understand why.
Even more puzzling when I generated data for the MWE then the function works fine. But it still behaves bizarrely with a sample from the real data. I am guessing there is something in my environment that is causing mischief but I am now lost as how to investigate further.
FWIW I have tried writing this function in data.table and now dplyr syntax, and I get similar problems with both approaches.
Simpler functions that don't use ifelse statements seem to behave just fine.
gen_sofa_c <- function(data, map=NA, noradr=NA, dopa=NA, adr=NA, vasopressin=NA) {
library(dplyr)
# Extract the arguments and force conversion to string
pars <- as.list(match.call()[-1])
vasopressin <- as.character(pars$vasopressin)
noradr <- as.character(pars$noradr)
adr <- as.character(pars$adr)
dopa <- as.character(pars$dopa)
map <- as.character(pars$map)
# Default to NA
# if ("sofa_c" %in% names(data)) data$sofa_c <- NULL
# data$sofa_c <- as.numeric(NA)
return(
data %>%
# # Return 0 if MAP >= 70
mutate(sofa_c = ifelse(!is.na(map) & map >= 70, 0 , NA)) %>%
# # Return 1 if MAP < 70
mutate(sofa_c = ifelse(!is.na(map) & map < 70, 1 , sofa_c)) %>%
# # Return SOFA 2 if norad OR adr > 0.0 or dopamine > 5
mutate(sofa_c = ifelse(!is.na(noradr) & noradr > 0.0 , 2 , sofa_c)) %>%
# # Return SOFA 3 if norad OR adr > 0.1 or dopamine > 15
mutate(sofa_c = ifelse(!is.na(noradr) & noradr > 0.1 , 3 , sofa_c)) %>%
# # Return SOFA 4 if on vasopressin
mutate(sofa_c = ifelse(!is.na(vasopressin) & vasopressin > 0, 4 , sofa_c)) %>%
# Return sofa_c
select(sofa_c)
)
}
Here is the simulated data
# Simulate data
set.seed(1234)
tdata <- data.table(map=round(rnorm(100,70,10)), noradr=round(rnorm(100,0,1),2), vasopressin=sample(c(rep(NA,9),1)))
tdata[, noradr := ifelse(noradr < 0, NA, noradr)]
sofa_c <- gen_sofa_c(tdata, map=map, noradr=rx_norad, dopa=rx_dopa, adr=rx_adre, vasopressin=rx_vasopr)
table(sofa_c)
(cbind(tdata, sofa_c))
My output is this
R> table(sofa_c)
sofa_c
0 1 2 3 4
17 27 4 42 10
R> head((cbind(tdata, sofa_c)),10)
map noradr vasopressin sofa_c
1: 58 0.41 NA 3
2: 73 NA NA 0
3: 81 0.07 1 4
4: 47 NA NA 1
5: 74 NA NA 0
6: 75 0.17 NA 3
7: 64 NA NA 1
8: 65 0.17 NA 3
9: 64 0.35 NA 3
10: 61 NA NA 1
Here is the real data (as sample from >2 million rows)
nrow(ddata)
rdata <- ddata[runif(100,1,nrow(ddata)),.(map,norad=rx_norad,vasopressin=rx_vasopr)]
dput(rdata)
rm(sofa_c)
sofa_c <- gen_sofa_c(rdata, map=map, noradr=rx_norad, dopa=rx_dopa, adr=rx_adre, vasopressin=rx_vasopr)
table(sofa_c)
head((cbind(rdata, sofa_c)),10)
Here is the sample from the real data
R> dput(rdata)
structure(list(map = c(80, 82, 76, NA, 87, NA, NA, NA, NA, NA,
NA, NA, NA, 124, 65, 63, NA, 70, NA, NA, NA, NA, NA, NA, NA,
100, NA, NA, NA, NA, 85, 85, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 97, 84, 0, 84, NA, 75, NA, NA, NA, 67, NA, 58, NA, 153,
122, NA, NA, 91, 90, NA, NA, 87, NA, 60, 72, 107, 62, NA, NA,
97, 88, NA, NA, NA, 60, 81, 80, NA, NA, 82, 72, NA, 98, NA, NA,
80, 82, NA, NA, NA, 68, NA, NA, 126, 90, 65, 67, NA), norad = c(NA,
NA, NA, NA, 0, NA, NA, 0.14, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 0.18, 0.00952381, NA, NA, 0.12962963, NA, NA, NA, NA, NA,
NA, NA, NA, NA, 0.172222, NA, NA, NA, NA, NA, 0.0623529, NA,
NA, NA, NA, 0.29005848, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.221667,
NA, NA, NA, NA, NA, 0.02, NA, NA, NA, NA, NA, 0.08, NA, NA, NA,
NA, NA, NA, NA, NA, 0.284444444, NA, NA, 0.19, NA, NA, NA, NA,
4, NA, NA), vasopressin = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 2, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), .Names = c("map",
"norad", "vasopressin"), row.names = c(NA, -100L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x101810978>)
Here is the real data output
R> table(sofa_c)
sofa_c
3 4
99 1
R> head((cbind(rdata, sofa_c)),10)
map norad vasopressin sofa_c
1: 80 NA NA 3
2: 82 NA NA 3
3: 76 NA NA 3
4: NA NA NA 3
5: 87 0.00 NA 3
6: NA NA NA 3
7: NA NA NA 3
8: NA 0.14 NA 3
9: NA NA NA 3
10: NA NA NA 3

Resources