I am trying to figure out how to reshape a dataset of the names of political parties from wide to long using dplyr and pivot_longer.
For each Party_ID, there is a number of constant columns attached (Party_Name_Short, Party_Name, Country, Party_in_orig_title) and a number of time changing factors as well: election, Date, Rename, Reason, Party_Title, alliance, member_parties, split, parent_party, merger, child_party, successor, predecessor. The time changing factors were recorded up to 11 times for each party, as reflected by the index in the colname.
In order to provide a sample I selected the first three time changing columns for each party and a sample of 5 random rows:
structure(list(Party_Name_Short = c("LZJ-PS", "ZiZi", "MNR",
"MDP", "E200"), Party_Name = c("Lista Zorana Jankovica – Pozitivna Slovenija",
"Živi zid", "Mouvement national républicain", "Movimento Democrático Português",
"Erakond Eesti 200"), Country = c("SVN", "HRV", "FRA", "PRT",
"EST"), Party_ID = c(1987, 2612, 1263, 1281, 2720), Party_in_orig_title = c(0,
0, 0, 0, 0), Date1 = c(2011, NA, 1999, 1987, NA), Rename1 = c("Lista Zorana Jankovica – Pozitivna Slovenija",
NA, "Mouvement national républicain", "ID", NA), Reason1 = c("foundation",
NA, "split from FN", "split", NA), Party_Title1 = c(0, NA, 0,
0, NA), alliance1 = c(0, NA, 0, 0, NA), member_parties1 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_),
split1 = c(0, NA, 1, 1, NA), parent_party1 = c(NA, NA, "FN",
"MDP", NA), merger1 = c(0, NA, 0, 0, NA), child_party1 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
), successor1 = c(0, NA, 0, 0, NA), predecessor1 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
), Date2 = c(2012, NA, NA, NA, NA), Rename2 = c("Pozitivna Slovenija",
NA, NA, NA, NA), Reason2 = c("renamed", NA, NA, NA, NA),
Party_Title2 = c(0, NA, NA, NA, NA), alliance2 = c(0, NA,
NA, NA, NA), member_parties2 = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), split2 = c(0,
NA, NA, NA, NA), parent_party2 = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), merger2 = c(0,
NA, NA, NA, NA), child_party2 = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), successor2 = c(0,
NA, NA, NA, NA), predecessor2 = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), Date3 = c(2014,
NA, NA, NA, NA), Rename3 = c("ZaAB", NA, NA, NA, NA), Reason3 = c("split",
NA, NA, NA, NA), Party_Title3 = c(0, NA, NA, NA, NA), alliance3 = c(0,
NA, NA, NA, NA), member_parties3 = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), split3 = c(1,
NA, NA, NA, NA), parent_party3 = c("LZJ-PS", NA, NA, NA,
NA), merger3 = c(0, NA, NA, NA, NA), child_party3 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
), successor3 = c(0, NA, NA, NA, NA), predecessor3 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
), election1 = structure(c(15309, 16740, 11839, 6390, 17956
), class = "Date"), election2 = structure(c(16252, NA, NA,
NA, NA), class = "Date"), election3 = structure(c(16344,
NA, NA, NA, NA), class = "Date")), row.names = c(NA, -5L), class = c("tbl_df",
"tbl", "data.frame"))
I would like the data to follow a "long" structure where each party_id and the constant factors are repeated 11 times and there are single columns for the time changing factors. Following the top-rated answer formulated here I tried different variations of the following command:
pivot_longer(cols = starts_with(c("election", "Date", "Rename", "Reason", "Party_Title",
"alliance", "member_parties", "split", "parent_party",
"merger", "child_party", "successor", "predecessor")),
names_to = c(".value", "election", "Date", "Rename", "Reason", "Party_Title",
"alliance", "member_parties", "split", "parent_party",
"merger", "child_party", "successor", "predecessor"), names_sep = "_") %>%
select(-matches("election[1-9]"), -matches("Date[1-9]"), -matches("Rename[1-9]"),
-matches("Reason[1-9]"), -matches("alliance[1-9]"), -matches("member_parties[1-9]"),
-matches("split[1-9]"), -matches("parent_party[1-9]"), -matches("merger[1-9]"),
-matches("child_party[1-9]"), -matches("successor[1-9]"), -matches("predecessor[1-9]"),
-matches("Party_Title[1-9]"), -matches("election1[0-2]"), -matches("Date1[0-2]"), -matches("Rename1[0-2]"),
-matches("Reason1[0-2]"), -matches("alliance1[0-2]"), -matches("member_parties1[0-2]"),
-matches("split1[0-2]"), -matches("parent_party1[0-2]"), -matches("merger1[0-2]"),
-matches("child_party1[0-2]"), -matches("successor1[0-2]"), -matches("predecessor1[0-2]"),
-matches("Party_Title1[0-2]"))
However, for some reason, I get a lot of missing values and do not achieve the shape of the data I would like to have. I'd appreciate any hint if you have an idea of how to do this. Thanks!
Update:
I would like the final output to look something like:
structure(list(Party_Name_Short = c("LZJ-PS", "ZiZi", "MNR",
"MDP", "E200", "LZJ-PS", "ZiZi", "MNR", "MDP", "E200", "LZJ-PS",
"ZiZi", "MNR", "MDP", "E200"), Party_Name = c("Lista Zorana Jankovica – Pozitivna Slovenija",
"Živi zid", "Mouvement national républicain", "Movimento Democrático Português",
"Erakond Eesti 200", "Lista Zorana Jankovica – Pozitivna Slovenija",
"Živi zid", "Mouvement national républicain", "Movimento Democrático Português",
"Erakond Eesti 200", "Lista Zorana Jankovica – Pozitivna Slovenija",
"Živi zid", "Mouvement national républicain", "Movimento Democrático Português",
"Erakond Eesti 200"), Country = c("SVN", "HRV", "FRA", "PRT",
"EST", "SVN", "HRV", "FRA", "PRT", "EST", "SVN", "HRV", "FRA",
"PRT", "EST"), Party_ID = c(1987, 2612, 1263, 1281, 2720, 1987,
2612, 1263, 1281, 2720, 1987, 2612, 1263, 1281, 2720), Party_in_orig_title = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), time = c(1, 1, 1,
1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3), Date = c(2011, NA, 1999,
1987, NA, 2012, NA, NA, NA, NA, 2014, NA, NA, NA, NA), Rename = c("Lista Zorana Jankovica – Pozitivna Slovenija",
NA, "Mouvement national républicain", "ID", NA, "Pozitivna Slovenija",
NA, NA, NA, NA, "ZaAB", NA, NA, NA, NA), Reason = c("foundation",
NA, "split from FN", "split", NA, "renamed", NA, NA, NA, NA,
"split", NA, NA, NA, NA), Party_Title = c(0, NA, 0, 0, NA, 0,
NA, NA, NA, NA, 0, NA, NA, NA, NA), alliance = c(0, NA, 0, 0,
NA, 0, NA, NA, NA, NA, 0, NA, NA, NA, NA), member_parties = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), split = c(0,
NA, 1, 1, NA, 0, NA, NA, NA, NA, 1, NA, NA, NA, NA), parent_party = c(NA,
NA, "FN", "MDP", NA, NA, NA, NA, NA, NA, "LZJ-PS", NA, NA, NA,
NA), merger = c(0, NA, 0, 0, NA, 0, NA, NA, NA, NA, 0, NA, NA,
NA, NA), child_party = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), successor = c(0, NA, 0, 0, NA, 0, NA,
NA, NA, NA, 0, NA, NA, NA, NA), predecessor = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), election = structure(c(1322697600,
1446336000, 1022889600, 552096000, 1551398400, 1404172800, NA,
NA, NA, NA, 1412121600, NA, NA, NA, NA), class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -15L), class = c("tbl_df",
"tbl", "data.frame"))
Notice: the newly added time column and notice that this is only for example purposes, with three time changing factors, whereas in fact there are 11 in the data.
Using pivot_longer with names_sep to split between a non-digit and a digit at the end of the string
library(tidyr)
library(dplyr)
df1 %>%
pivot_longer(cols = matches('\\d+$'), names_to = c(".value", 'time'),
names_sep="(?<=\\D)(?=\\d+$)") %>%
arrange(time)
# A tibble: 15 x 19
# Party_Name_Short Party_Name Country Party_ID Party_in_orig_t… time Date Rename Reason Party_Title alliance member_parties split
# <chr> <chr> <chr> <dbl> <dbl> <chr> <dbl> <chr> <chr> <dbl> <dbl> <chr> <dbl>
# 1 LZJ-PS Lista Zor… SVN 1987 0 1 2011 Lista… found… 0 0 <NA> 0
# 2 ZiZi Živi zid HRV 2612 0 1 NA <NA> <NA> NA NA <NA> NA
# 3 MNR Mouvement… FRA 1263 0 1 1999 Mouve… split… 0 0 <NA> 1
# 4 MDP Movimento… PRT 1281 0 1 1987 ID split 0 0 <NA> 1
# 5 E200 Erakond E… EST 2720 0 1 NA <NA> <NA> NA NA <NA> NA
# 6 LZJ-PS Lista Zor… SVN 1987 0 2 2012 Pozit… renam… 0 0 <NA> 0
# 7 ZiZi Živi zid HRV 2612 0 2 NA <NA> <NA> NA NA <NA> NA
# 8 MNR Mouvement… FRA 1263 0 2 NA <NA> <NA> NA NA <NA> NA
# 9 MDP Movimento… PRT 1281 0 2 NA <NA> <NA> NA NA <NA> NA
#10 E200 Erakond E… EST 2720 0 2 NA <NA> <NA> NA NA <NA> NA
#11 LZJ-PS Lista Zor… SVN 1987 0 3 2014 ZaAB split 0 0 <NA> 1
#12 ZiZi Živi zid HRV 2612 0 3 NA <NA> <NA> NA NA <NA> NA
#13 MNR Mouvement… FRA 1263 0 3 NA <NA> <NA> NA NA <NA> NA
#14 MDP Movimento… PRT 1281 0 3 NA <NA> <NA> NA NA <NA> NA
#15 E200 Erakond E… EST 2720 0 3 NA <NA> <NA> NA NA <NA> NA
# … with 6 more variables: parent_party <chr>, merger <dbl>, child_party <chr>, successor <dbl>, predecessor <chr>, election <date>
Related
Df <- data.frame(prop1 = c(NA, NA, NA, "French", NA, NA,NA, "-29 to -20", NA, NA, NA, "Pop", NA, NA, NA, "French", "-29 to -20", "Pop"),
prop1_rank = c(NA, NA, NA, 0, NA, NA,NA, 11, NA, NA, NA, 1, NA, NA, NA, 40, 0, 2),
prop2 = c(NA, NA, NA, "Spanish", NA, NA,NA, "-19 to -10", NA, NA, NA, "Rock", NA, NA, NA, "Spanish", "-19 to -10", "Rock"),
prop2_rank = c(NA, NA, NA, 10, NA, NA,NA, 4, NA, NA, NA, 1, NA, NA, NA, 1, 0, 2),
initOSF1 = c(NA, NA, NA, NA, NA, "French", NA,NA,NA, "-29 to -20", NA, NA, NA, "Pop", NA, NA, NA, NA),
initOSF1_freq = c(NA, NA, NA, NA, NA, 66, NA,NA,NA, 0, NA, NA, NA, 14, NA, NA, NA, NA),
initOSF2 = c(NA, NA, NA, NA, NA, "Spanish", NA,NA,NA, "-19 to -10", NA, NA, NA, "Rock", NA, NA, NA, NA),
initOSF2_freq = c(NA, NA, NA, NA, NA, 0, NA,NA,NA, 6, NA, NA, NA, 14, NA, NA, NA, NA))
Df
I would like to organize this into
3 columns consisting: c("propositions", "ranks", "freqs"),
where,
Propositions column has the values: "French", "Spanish", "-29 to -20", "19 to -10", "Pop", "Rock", and having a separate columns for the rank values e.g., 0 for French, 10 for Spanish, etc., and frequency values e.g., 66 for French, 0 for Spanish, etc.
This is not an easy one. Probably a better solution exists:
library(tidyverse)
library(data.table)
setDT(Df) %>%
select(contains(c('prop', 'rank', 'freq'))) %>%
filter(!if_all(everything(), is.na)) %>%
melt(measure.vars = patterns(c('prop.$', 'rank$', 'freq'))) %>%
group_by(gr=cumsum(!is.na(value1)))%>%
summarise(across(-variable, ~if(length(.x)>1) na.omit(.x) else .x))
# A tibble: 12 x 4
gr value1 value2 value3
<int> <chr> <dbl> <dbl>
1 1 French 0 66
2 2 -29 to -20 11 0
3 3 Pop 1 14
4 4 French 40 NA
5 5 -29 to -20 0 NA
6 6 Pop 2 NA
7 7 Spanish 10 0
8 8 -19 to -10 4 6
9 9 Rock 1 14
10 10 Spanish 1 NA
11 11 -19 to -10 0 NA
12 12 Rock 2 NA
I want to print the rownames of a dataframe as vector if it matches a string of another dataframe.
I made a thesaurus with synonyms that looks like this
Synonym Synonym2 Synonym3
0010 01 beobachten U
0030 hkp <NA> <NA>
0040 hkp <NA> <NA>
0050 <NA> <NA> <NA>
0060 <NA> <NA> <NA>
0065 <NA> <NA> <NA>
0070 vipr perk <NA>
0080 oberfl anästh anästh <NA>
0090 vest inj vest inj inj sept blau
0100 l1 <NA> <NA>
0110 <NA> üz <NA>
0120 <NA> <NA> <NA>
1000 gezeigt zu achten putzdruck mhu
and I have a second dataframe:
PKV_clean
ID Aufzeichnungen
1 1 scharfkantig
2 1 t
3 1 aht 36 üz distal
4 1 seit paartagen
5 1 36 vipr
6 1 perk
7 1 üz bilfuird
8 1
9 1
10 1 knirscht
11 1 schiene empohlen
12 1 meldet
and it should print the rownames of the thesaurus when matching any string of a row (Synonym) with the second dataframe.
For example it should print (0070,0110) because Synonym "vipr" of 0070 and Synonym2 "üz" of 0110 matches with the second dataframe.
I tried it with:
#Sure things
a <- thesaurus[grep(PKV_clean$Aufzeichnungen,rownames(thesaurus))]
but this didn't work.
I have also tried creating a separate number for each case and create a 0|1 matrix. This works fine but with over 500 entries this is a lot of manual work.
PKV$"0070" = 0
PKV$"0070"[grepl("vipr | perk",PKV$Aufzeichnungen)] = 1
PKV$"0110" = 0
PKV$"0110"[grepl("üz",PKV$Aufzeichnungen)] = 1
output <- PKV|>
select(where(is.numeric)) |>
select(where(~ sum(.) > 0)) |>
names()
Thank you for your help!
#Mohammed Desouky
if I use the hole Dictionary with your code I get as output:
Edit 2:
dput(head(thesaurus , 40))
structure(list(Synonym = c("01", "hkp", "hkp", NA, NA, NA, "vipr",
"oberfl anästh", "vest inj", "l1", NA, NA, "gezeigt zu achten",
NA, NA, NA, "pzr", NA, "duraphat aufgetragen", NA, "bmf blutstillung",
NA, "fllg", "f1", "f2", "f2 sät", "f3", "mdv", "f4", NA, NA,
NA, NA, NA, NA, "st", NA, "metallprimer oberfläche", NA, NA),
Synonym2 = c("beobachten", NA, NA, NA, NA, NA, "perk", "anästh",
"vest inj", NA, NA, NA, "putzdruck", NA, NA, NA, "psi sbi api",
NA, "empfindliche stelle aufgetragen", NA, "cp", NA, "fllng",
NA, "fllng 2", "tetric flow", "fllg 3", NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, "monobond plus behandelt", NA, NA), Synonym3 = c("U",
NA, NA, NA, NA, NA, NA, NA, "inj sept blau", NA, NA, NA,
"mhu", NA, NA, NA, "dentalhygiene", NA, "üz", NA, "blutstillung",
NA, "f1", NA, "fllg 2", NA, "fllng 3", NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), Synonym4 = c(NA, NA, NA,
NA, NA, NA, NA, NA, "cx pulpennah", NA, NA, NA, "gezeigt",
NA, NA, NA, NA, NA, "duraphat", NA, "bmf", NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), Synonym5 = c(NA, NA, NA, NA, NA, NA, NA, NA, "infiltration",
NA, NA, NA, "pat gezeigt", NA, NA, NA, NA, NA, NA, NA, "visco gel",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), Synonym6 = c(NA, NA, NA, NA, NA, NA, NA,
NA, "injektion", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), Synonym7 = c(NA, NA, NA, NA, NA,
NA, NA, NA, "infil", NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA), Synonym8 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), Synonym9 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_)), row.names = c("0010",
"0030", "0040", "0050", "0060", "0065", "0070", "0080", "0090",
"0100", "0110", "0120", "1000", "1010", "1020", "1030", "1040",
"2000", "2010", "2020", "2030", "2040", "2050", "2060", "2070",
"2080", "2090", "2100", "2110", "2120", "2130", "2150", "2160",
"2170", "2180", "2190", "2195", "2197", "2200", "2210"), class = "data.frame")```
Can you work with this?
Try this
Synonym <- c()
for(i in 1:nrow(PKV_clean)){
s <- gsub("\\s+" , "|" , PKV_clean$Aufzeichnungen[i])
if(nchar(s) > 1){
ind <- rowSums(sapply(thesaurus , \(x) grepl(s , x)
))
} else ind <- rep(0 , nrow(thesaurus))
a <- which(ind >= 1)
if(length(a) > 0) ans <- toString(rownames(thesaurus)[a]) else ans <- ""
Synonym <- c(Synonym , ans)
}
PKV_clean$Synonym <- Synonym
output
ID Aufzeichnungen Synonym
1 1 scharfkantig
2 1 t
3 1 aht 36 üz distal 0110
4 1 seit paartagen
5 1 36 vipr 0070
6 1 perk 0070
7 1 üz bilfuird 0110
8 1
9 1
10 1 knirscht
11 1 schiene empohlen
12 1 meldet
I have a dataframe that looks something like this, in which I have several rows for each user, and many NAs in the columns.
user
Effect T1
Effect T2
Effect T3
Benchmark T1
Benchmark T2
Benchmark T3
Tom
01
NA
NA
02
NA
NA
Tom
NA
07
NA
NA
08
NA
Tom
NA
NA
13
NA
NA
14
Larry
03
NA
NA
04
NA
NA
Larry
NA
09
NA
NA
10
NA
Larry
NA
NA
15
NA
NA
16
Dave
05
NA
NA
06
NA
NA
Dave
NA
11
NA
NA
12
NA
Dave
NA
NA
17
NA
NA
18
I want to collapse the columns using the name and filling the values from reach row, this this.
user
Effect T1
Effect T2
Effect T3
Benchmark T1
Benchmark T2
Benchmark T3
Tom
01
07
13
02
08
14
Larry
03
09
15
04
10
16
Dave
05
11
17
06
12
18
How might I accomplish this?
Thank you in advance for your help. Update: I've added the dput of a subset of the actual data below.
structure(list(name = c("Abraham_Ralph", "Abraham_Ralph", "Abraham_Ralph",
"Ackerman_Gary", "Adams_Alma", "Adams_Alma", "Adams_Alma", "Adams_Alma",
"Adams_Sandy", "Aderholt_Robert", "Aderholt_Robert", "Aderholt_Robert",
"Aderholt_Robert", "Aderholt_Robert", "Aguilar_Pete", "Aguilar_Pete",
"Aguilar_Pete"), state = c("LA", "LA", "LA", "NY", "NC", "NC",
"NC", "NC", "FL", "AL", "AL", "AL", "AL", "AL", "CA", "CA", "CA"
), seniority = c(1, 2, 3, 15, 1, 2, 3, 4, 1, 8, 9, 10, 11, 12,
1, 2, 3), legeffect_112 = c(NA, NA, NA, 0.202061712741852, NA,
NA, NA, NA, 1.30758035182953, 3.73544979095459, NA, NA, NA, NA,
NA, NA, NA), legeffect_113 = c(NA, NA, NA, NA, 0, NA, NA, NA,
NA, NA, 0.908495426177979, NA, NA, NA, NA, NA, NA), legeffect_114 = c(2.07501077651978,
NA, NA, NA, NA, 0.84164834022522, NA, NA, NA, NA, NA, 0.340001106262207,
NA, NA, 0.10985741019249, NA, NA), legeffect_115 = c(NA, 0.493490308523178,
NA, NA, NA, NA, 0.587624311447144, NA, NA, NA, NA, NA, 0.159877583384514,
NA, NA, 0.730929613113403, NA), legeffect_116 = c(NA, NA, 0.0397605448961258,
NA, NA, NA, NA, 1.78378939628601, NA, NA, NA, NA, NA, 0.0198802724480629,
NA, NA, 0.0497006773948669), benchmark_112 = c(NA, NA, NA, 0.738679468631744,
NA, NA, NA, NA, 0.82908970117569, 1.39835929870605, NA, NA, NA,
NA, NA, NA, NA), benchmark_113 = c(NA, NA, NA, NA, 0.391001850366592,
NA, NA, NA, NA, NA, 1.58223271369934, NA, NA, NA, NA, NA, NA),
benchmark_114 = c(1.40446054935455, NA, NA, NA, NA, 0.576326191425323,
NA, NA, NA, NA, NA, 1.42212760448456, NA, NA, 0.574363172054291,
NA, NA), benchmark_115 = c(NA, 1.3291300535202, NA, NA, NA,
NA, 0.537361204624176, NA, NA, NA, NA, NA, 1.45703768730164,
NA, NA, 0.523149251937866, NA), benchmark_116 = c(NA, NA,
0.483340591192245, NA, NA, NA, NA, 1.31058621406555, NA,
NA, NA, NA, NA, 0.751261711120605, NA, NA, 1.05683290958405
)), row.names = c(NA, -17L), class = c("tbl_df", "tbl", "data.frame"
))
A data.table solution:
# melt data, remove NA, then recast ...
dt <- dcast(melt(data.table(d), "name")[!value %in% NA], name ~ variable)
dcast(melt(data.table(d), "name")[!value %in% c(NA) & !variable %in% c("variable", "seniority", "state")], name ~ variable)
name legeffect_112 legeffect_113 legeffect_114 legeffect_115 legeffect_116 benchmark_112 benchmark_113 benchmark_114 benchmark_115 benchmark_116
1: Abraham_Ralph <NA> <NA> 2.07501077651978 0.493490308523178 0.0397605448961258 <NA> <NA> 1.40446054935455 1.3291300535202 0.483340591192245
2: Ackerman_Gary 0.202061712741852 <NA> <NA> <NA> <NA> 0.738679468631744 <NA> <NA> <NA> <NA>
3: Adams_Alma <NA> 0 0.84164834022522 0.587624311447144 1.78378939628601 <NA> 0.391001850366592 0.576326191425323 0.537361204624176 1.31058621406555
4: Adams_Sandy 1.30758035182953 <NA> <NA> <NA> <NA> 0.82908970117569 <NA> <NA> <NA> <NA>
5: Aderholt_Robert 3.73544979095459 0.908495426177979 0.340001106262207 0.159877583384514 0.0198802724480629 1.39835929870605 1.58223271369934 1.42212760448456 1.45703768730164 0.751261711120605
6: Aguilar_Pete <NA> <NA> 0.10985741019249 0.730929613113403 0.0497006773948669 <NA> <NA> 0.574363172054291 0.523149251937866 1.05683290958405
Data/Setup
# Load data.table
# install.packages("data.table")
library(data.table)
# Read example data
d <- structure(list(name = c("Abraham_Ralph", "Abraham_Ralph", "Abraham_Ralph",
"Ackerman_Gary", "Adams_Alma", "Adams_Alma", "Adams_Alma", "Adams_Alma",
"Adams_Sandy", "Aderholt_Robert", "Aderholt_Robert", "Aderholt_Robert",
"Aderholt_Robert", "Aderholt_Robert", "Aguilar_Pete", "Aguilar_Pete",
"Aguilar_Pete"), state = c("LA", "LA", "LA", "NY", "NC", "NC",
"NC", "NC", "FL", "AL", "AL", "AL", "AL", "AL", "CA", "CA", "CA"
), seniority = c(1, 2, 3, 15, 1, 2, 3, 4, 1, 8, 9, 10, 11, 12,
1, 2, 3), legeffect_112 = c(NA, NA, NA, 0.202061712741852, NA,
NA, NA, NA, 1.30758035182953, 3.73544979095459, NA, NA, NA, NA,
NA, NA, NA), legeffect_113 = c(NA, NA, NA, NA, 0, NA, NA, NA,
NA, NA, 0.908495426177979, NA, NA, NA, NA, NA, NA), legeffect_114 = c(2.07501077651978,
NA, NA, NA, NA, 0.84164834022522, NA, NA, NA, NA, NA, 0.340001106262207,
NA, NA, 0.10985741019249, NA, NA), legeffect_115 = c(NA, 0.493490308523178,
NA, NA, NA, NA, 0.587624311447144, NA, NA, NA, NA, NA, 0.159877583384514,
NA, NA, 0.730929613113403, NA), legeffect_116 = c(NA, NA, 0.0397605448961258,
NA, NA, NA, NA, 1.78378939628601, NA, NA, NA, NA, NA, 0.0198802724480629,
NA, NA, 0.0497006773948669), benchmark_112 = c(NA, NA, NA, 0.738679468631744,
NA, NA, NA, NA, 0.82908970117569, 1.39835929870605, NA, NA, NA,
NA, NA, NA, NA), benchmark_113 = c(NA, NA, NA, NA, 0.391001850366592,
NA, NA, NA, NA, NA, 1.58223271369934, NA, NA, NA, NA, NA, NA),
benchmark_114 = c(1.40446054935455, NA, NA, NA, NA, 0.576326191425323,
NA, NA, NA, NA, NA, 1.42212760448456, NA, NA, 0.574363172054291,
NA, NA), benchmark_115 = c(NA, 1.3291300535202, NA, NA, NA,
NA, 0.537361204624176, NA, NA, NA, NA, NA, 1.45703768730164,
NA, NA, 0.523149251937866, NA), benchmark_116 = c(NA, NA,
0.483340591192245, NA, NA, NA, NA, 1.31058621406555, NA,
NA, NA, NA, NA, 0.751261711120605, NA, NA, 1.05683290958405
)), row.names = c(NA, -17L), class = c("tbl_df", "tbl", "data.frame"
))
This solution is using only the base functions (no extra packages), but the one-liner may cause eyes to cross, so I'll split it into several functions.
The plan is the following:
Split the original data.frame by the values in name column, using the function by;
For each partition of the data.frame, collapse the columns;
A collapsed column returns the max value of the column, or NA if all its values are NA;
The collapsed data.frame partitions are stacked together.
So, this is a function that does that:
dfr_collapse <- function(dfr, col0)
{
# Collapse the columns of the data.frame "dfr" grouped by the values of
# the column "col0"
# Max/NA function
namax <- function(x)
{
if(all(is.na(x)))
NA # !!!
else
max(x, na.rm=TRUE)
}
# Column collapse function
byfun <- function(x)
{
lapply(x, namax)
}
# Stack the partitioning results
return(do.call(
what = rbind,
args = by(dfr, dfr[[col0]], byfun)
))
}
May not look as slick as a one-liner, but it does the job. It can be tunrned into a one-liner, but you don't want that.
Assuming that df0 is the data.frame from you dput, you can test this function with
dfr_collapse(df0)
Nota bene: for the sake of simplicity, I return an NA of type logical (see the comment # !!! above). The correct code should convert that NA to the mode of the x vector. Also, the function should check the type of its inputs, etc.
here example of my data
mydat=structure(list(ADR.N.14.0 = c(8140010250001, 8140010250002),
NOMYAR.N.16.6 = c(1, 1), KOFPOR1.N.16.6 = c(7, 10), POR1.C.254 = c("о",
"BB"), VOZPOR1.N.16.6 = c(80, 45), VYSPOR1.N.16.6 = c(24,
17), DEMPOR1.N.16.6 = c(36, 16), POLNOT1.N.16.6 = c(0.6,
0.9), ZAPZAH1.N.16.6 = c(210, 160), NOMYAR2.N.16.6 = c(1,
1), KOFSOCT2.N.16.6 = c(3, 0), POR2.C.254 = c("BB", "о"),
VOZPOR2.N.16.6 = c(70, 45), VYSPOR2.N.16.6 = c(22, 17), DEMPOR2.N.16.6 = c(26,
22), POLNOT2.N.16.6 = c(0, 0), ZAPZAH2.N.16.6 = c(0, 0)), class = "data.frame", row.names = c(NA,
-2L))
how for each value of ADR,N,14,0move data from one variable under another.
To be more clear
here variables with prefix1
NOMYAR,N,16,6 KOFPOR**1**,N,16,6 POR**1**,C,254 VOZPOR**1**,N,16,6 VYSPOR**1**,N,16,6 DEMPOR**1**,N,16,6 POLNOT**1**,N,16,6 ZAPZAH**1**,N,16,6
and near rows with prefix2
NOMYAR**2**,N,16,6 KOFPOR**2**,N,16,6 POR**2**,C,254 VOZPOR**2**,N,16,6 VYSPOR**1**,N,16,6 DEMPOR**2**,N,16,6 POLNOT**2**,N,16,6 ZAPZAH**2**,N,16,6
so i need that for for ADR,N,14,0 =8140010250001
the content of the fields with the prefix 2 was under the content of the fields with the prefix 1
like this
result=structure(list(ADR.N.14.0 = c(8140010250001, 8140010250001, 8140010250002,
8140010250002, NA, NA, NA, NA, NA, NA), NOMYAR.N.16.6 = c(1,
1, 1, 1, NA, NA, NA, NA, NA, NA), KOFPOR1.N.16.6 = c(7, 3, 10,
0, NA, NA, NA, NA, NA, NA), POR1.C.254 = c("о", "BB", "BB", "о",
"", "", "", "", "", ""), VOZPOR1.N.16.6 = c(80, 70, 45, 45, NA,
NA, NA, NA, NA, NA), VYSPOR1.N.16.6 = c(24, 22, 17, 17, NA, NA,
NA, NA, NA, NA), DEMPOR1.N.16.6 = c(36, 26, 16, 22, NA, NA, NA,
NA, NA, NA), POLNOT1.N.16.6 = c(0.6, 0, 0.9, 0, NA, NA, NA, NA,
NA, NA), ZAPZAH1.N.16.6 = c(210, 0, 160, 0, NA, NA, NA, NA, NA,
NA)), class = "data.frame", row.names = c(NA, -10L))
How can i do such transpose?
You can use pivot_longer and specify names_pattern to include pattern of names that you want together.
tidyr::pivot_longer(mydat, cols = -ADR.N.14.0,
names_to = c('.value'),
names_pattern = '(.*?)\\d?\\..*')
# ADR.N.14.0 NOMYAR KOFPOR POR VOZPOR VYSPOR DEMPOR POLNOT ZAPZAH KOFSOCT
# <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 8140010250001 1 7 о 80 24 36 0.6 210 3
#2 8140010250001 1 NA BB 70 22 26 0 0 NA
#3 8140010250002 1 10 BB 45 17 16 0.9 160 0
#4 8140010250002 1 NA о 45 17 22 0 0 NA
I fitted a model and want to take a look (and plot) at the partial dependencies.
For this task, I use the mlr package. However, since I have 80 features I only want to look at those with the highest effect on the target variable. Is there a way to calculate or show the partial dependence only for the features with the highest influence?
Here is an example: I just fitted 4 values. Let´s assume I only want to look or calculate the partial dependence for the 2 most influential features.
library(mlr)
pd = generatePartialDependenceData(mod, train_task, c("diveyTrue", "dinnerTrue","BikeParkingTrue", "latenightTrue"))
pd
PartialDependenceData
Task: dat
Features: diveyTrue, dinnerTrue, BikeParkingTrue, latenightTrue
Target: diveyTrue, dinnerTrue, BikeParkingTrue, latenightTrue
Derivative: FALSE
Interaction: FALSE
Individual: FALSE
review_count diveyTrue dinnerTrue BikeParkingTrue latenightTrue
1: 73.92993 0.0000000 NA NA NA
2: 73.68386 0.1111111 NA NA NA
3: 73.68386 0.2222222 NA NA NA
4: 73.68386 0.3333333 NA NA NA
5: 73.68386 0.4444444 NA NA NA
6: 63.56335 0.5555556 NA NA NA
... (#rows: 40, #cols: 5)
The task is a regression and the first column is the target variable. All other variables are dummies. Therefore, the target variable stays constant until the value of "diveyTrue" is greater than 0.5.
Here is a small dput():
structure(list(data = structure(list(review_count = c(73.9299260484918,
73.6838552698629, 73.6838552698629, 73.6838552698629, 73.6838552698629,
63.5633491608329, 63.5633491608329, 63.5633491608329, 63.5633491608329,
63.5633491608329, 44.123492893074, 44.0855985404284, 44.0855985404284,
44.0855985404284, 44.0855985404284, 67.9185575263356, 67.9185575263356,
67.9185575263356, 67.9185575263356, 67.9185575263356, 64.1248331786005,
64.1243679505065, 64.1243679505065, 64.1243679505065, 64.1243679505065,
64.9177431842816, 64.9177431842816, 64.9177431842816, 64.9177431842816,
64.9177431842816, 58.2709529252224, 58.2709529252224, 58.2709529252224,
58.2709529252224, 58.2709529252224, 89.8281204749236, 89.8281204749236,
89.8281204749236, 89.8281204749236, 89.8281204749236), diveyTrue = c(0,
0.111111111111111, 0.222222222222222, 0.333333333333333, 0.444444444444444,
0.555555555555556, 0.666666666666667, 0.777777777777778, 0.888888888888889,
1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
dinnerTrue = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0,
0.111111111111111, 0.222222222222222, 0.333333333333333,
0.444444444444444, 0.555555555555556, 0.666666666666667,
0.777777777777778, 0.888888888888889, 1, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA), BikeParkingTrue = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, 0.111111111111111,
0.222222222222222, 0.333333333333333, 0.444444444444444,
0.555555555555556, 0.666666666666667, 0.777777777777778,
0.888888888888889, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA), latenightTrue = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, 0, 0.111111111111111, 0.222222222222222,
0.333333333333333, 0.444444444444444, 0.555555555555556,
0.666666666666667, 0.777777777777778, 0.888888888888889,
1)), row.names = c(NA, -40L), class = c("data.table", "data.frame"
), .internal.selfref = <pointer: 0x0000000002521ef0>), task.desc = structure(list(
id = "dat", type = "regr", target = "review_count", size = 9943L,
n.feat = c(numerics = 79L, factors = 0L, ordered = 0L, functionals = 0L
), has.missings = TRUE, has.weights = FALSE, has.blocking = FALSE,
has.coordinates = FALSE), class = c("RegrTaskDesc", "SupervisedTaskDesc",
"TaskDesc")), target = c("diveyTrue", "dinnerTrue", "BikeParkingTrue",
"latenightTrue"), features = c("diveyTrue", "dinnerTrue", "BikeParkingTrue",
"latenightTrue"), derivative = FALSE, interaction = FALSE, individual = FALSE), class = "PartialDependenceData")