I have a dataframe like this:
df <- data.frame(theme1=c("hello",NA,NA,NA), theme2=c(NA,"world",NA,NA), theme3=c(NA,NA,"good_morning",NA), theme4=c(NA,NA,NA,"good_evening"))
theme1 theme2 theme3 theme4
1 hello NA NA NA
2 NA world NA NA
3 NA NA good_morning NA
4 NA NA NA good_evening
Now I want to obtain a single column while preserving the row order:
**Theme_merged**
hello
world
good_morning
good_evening
Tries:
merge_themes <- data.frame(cbind(mycol = na.omit(unlist(data2_tst[18:23]))), stringsAsFactors = F)
The above code works, but it does not preserve the row order, so when I want to put the vector back into the original dataframe it no longer matches.
Real data:
dput(head(data2_tst[18:23], n = 50))
structure(list(Theme1 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, "%Bedrukken%", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, "%Bedrukken%", NA, NA, NA, NA, NA, NA, NA, NA), Theme2 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "%Nieuwste|Nieuwe|201[6:7]%",
"%Nieuwste|Nieuwe|201[6:7]%", "%Nieuwste|Nieuwe|201[6:7]%", NA,
NA, NA, NA, NA, "%Nieuwste|Nieuwe|201[6:7]%", "%Nieuwste|Nieuwe|201[6:7]%",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "%Nieuwste|Nieuwe|201[6:7]%",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "%Nieuwste|Nieuwe|201[6:7]%",
"%Nieuwste|Nieuwe|201[6:7]%"), Theme3 = c("%Nodig%", NA, "%Nodig%",
"%Nodig%", "%Nodig%", NA, NA, "%Nodig%", NA, "%Nodig%", NA, NA,
NA, NA, "%Nodig%", "%Nodig%", "%Nodig%", NA, NA, NA, NA, NA,
NA, "%Nodig%", "%Nodig%", NA, NA, "%Nodig%", NA, "%Nodig%", "%Nodig%",
"%Nodig%", NA, "%Nodig%", "%Nodig%", "%Nodig%", NA, NA, NA, "%Nodig%",
"%Nodig%", NA, "%Nodig%", NA, "%Nodig%", "%Nodig%", NA, "%Nodig%",
NA, NA), Theme4 = c(NA, "%Kopen%", NA, NA, NA, "%Kopen%", "%Kopen%",
NA, "%Kopen%", NA, NA, NA, NA, NA, NA, NA, NA, "%Kopen%", "%Kopen%",
NA, NA, "%Kopen%", "%Kopen%", NA, NA, "%Kopen%", "%Kopen%", NA,
"%Kopen%", NA, NA, NA, NA, NA, NA, NA, "%Kopen%", "%Kopen%",
"%Kopen%", NA, NA, NA, NA, "%Kopen%", NA, NA, "%Kopen%", NA,
NA, NA), Theme5 = c(NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_), Theme6 = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_)), .Names = c("Theme1",
"Theme2", "Theme3", "Theme4", "Theme5", "Theme6"), row.names = 3:52, class = "data.frame")
Version 0.5.0 of dplyr introduced the coalesce function:
This version of dplyr gains a number of vector functions inspired by SQL. Two functions make it a little easier to eliminate or generate missing values:
Given a set of vectors, coalesce() finds the first non-missing value in each position.
To apply this to the sample data frame you can use:
# Convert every column to character first; this avoids the
# 'invalid factor levels' error that coalesce() raises on factor columns.
df <- mutate(df, across(everything(), as.character))
# For each row, keep the first non-missing value across the theme columns.
df$merged <- with(df, coalesce(theme1, theme2, theme3, theme4))
I found it necessary to convert from factors to character to avoid an 'invalid factor levels' error.
On your real data no conversion is necessary:
# For each row, keep the first non-missing value across Theme1..Theme6.
df$merged <- with(df, coalesce(Theme1, Theme2, Theme3, Theme4, Theme5, Theme6))
In SQL this would be the COALESCE function:
apply(df, 1, function(r) c(na.omit(r), NA)[1])
# [1] "hello" "world" "good_morning" "good_evening"
df <- data.frame(
theme1=c("hello",NA,NA,NA),
theme2=c(NA,"world",NA,NA),
theme3=c(NA,NA,"good_morning",NA),
theme4=c(NA,NA,NA,"good_evening"),
stringsAsFactors = FALSE
)
On your example data na.omit(unlist(df, use.names = FALSE)) will work fine, but it will fail if there is a row of only NA values:
df2 <- data.frame(
theme1=c("hello",NA,NA,NA,NA),
theme2=c(NA,"world",NA,NA,NA),
theme3=c(NA,NA,"good_morning",NA,NA),
theme4=c(NA,NA,NA,"good_evening",NA),
theme5=c(NA_character_,NA_character_,NA_character_,
NA_character_,NA_character_),
stringsAsFactors = FALSE
)
df2$X <- na.omit(unlist(df2, use.names = FALSE))
# Error in `$<-.data.frame`(`*tmp*`, "X", value = c("hello", "world", "good_morning", :
# replacement has 4 rows, data has 5
df2$X <- apply(df2, 1, function(r) c(na.omit(r), NA)[1])
# theme1 theme2 theme3 theme4 theme5 X
# 1 hello <NA> <NA> <NA> <NA> hello
# 2 <NA> world <NA> <NA> <NA> world
# 3 <NA> <NA> good_morning <NA> <NA> good_morning
# 4 <NA> <NA> <NA> good_evening <NA> good_evening
# 5 <NA> <NA> <NA> <NA> <NA> <NA>
Another option could be df2$X <- df2[cbind(1:nrow(df2), max.col(!is.na(df2)))]
Here's a tidyverse solution (uses dplyr and tidyr or just tidyverse)
library(tidyverse)
> df <- df %>%
gather("theme", "theme_merged", 1:4) %>%
filter(!is.na(theme_merged)) %>%
select(theme_merged)
> df
theme_merged
1 hello
2 world
3 good_morning
4 good_evening
This should work with your data:
new_df = c(as.matrix(df))
This line first converts the df to a matrix and binds all the columns in one vector with c().
new_df <- new_df[!is.na(new_df)]
And now we keep only the non-NA entries. If you want you can convert it back to a dataframe:
new_df <- data.frame(new_df);names(new_df) <- "Themes"
Related
looking for answers similar to these posts; R: Replace multiple values in multiple columns of dataframes with NA ; Multiple replacement in R
My dataframe my.df contains NAs.
dput(my.df)
structure(list(`AICAR (GDSC1:1001)_GDSC1` = c(10.1253052794007,
NA, NA, NA, NA, NA, 9.3362273693641, NA, NA, NA), `vinblastine (GDSC1:1004)_GDSC1` = c(-5.56689193211021,
NA, NA, NA, NA, NA, -3.49808657768651, NA, NA, -5.7323006155361
), `cisplatin (GDSC1:1005)_GDSC1` = c(3.20680858158152, NA, NA,
NA, NA, NA, NA, NA, NA, NA), `cytarabine (GDSC1:1006)_GDSC1` = c(-1.29089026889862,
NA, NA, NA, NA, NA, NA, NA, NA, NA), `docetaxel (GDSC1:1007)_GDSC1` = c(-9.21190331946225,
NA, NA, NA, NA, NA, NA, NA, NA, -6.51430196744496), `methotrexate (GDSC1:1008)_GDSC1` = c(NA,
NA, NA, NA, NA, NA, -4.96153980941858, NA, NA, NA), `gefitinib (GDSC1:1010)_GDSC1` = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, -4.65609368323825), `navitoclax (GDSC1:1011)_GDSC1` = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_), `vorinostat (GDSC1:1012)_GDSC1` = c(-0.1834250603902,
1.80666265545084, 0.503152683902549, 1.78569632218743, NA, 1.01934567070847,
0.321867836558935, NA, 2.18003424956055, 0.143794452798708)), row.names = c(NA,
10L), class = "data.frame")
I get the cell location of each NA using idx <- my.df %>% lapply(., function(x) which(is.na(x)))
Convert these NAs to 0 by my.df %>% mutate_if(.,is.numeric, ~replace(., is.na(.), 0)) before I calculate correlations.
Now how can I put the NAs back into their original cells based on the idx?
I reckon loops, tidy, purrr or something similar can do this fast? It would be great if the column names of my.df could be matched against the names of idx as a quality control in the code.
Thanks!
I want to print the rownames of a dataframe as vector if it matches a string of another dataframe.
I made a thesaurus with synonyms that looks like this
Synonym Synonym2 Synonym3
0010 01 beobachten U
0030 hkp <NA> <NA>
0040 hkp <NA> <NA>
0050 <NA> <NA> <NA>
0060 <NA> <NA> <NA>
0065 <NA> <NA> <NA>
0070 vipr perk <NA>
0080 oberfl anästh anästh <NA>
0090 vest inj vest inj inj sept blau
0100 l1 <NA> <NA>
0110 <NA> üz <NA>
0120 <NA> <NA> <NA>
1000 gezeigt zu achten putzdruck mhu
and I have a second dataframe:
PKV_clean
ID Aufzeichnungen
1 1 scharfkantig
2 1 t
3 1 aht 36 üz distal
4 1 seit paartagen
5 1 36 vipr
6 1 perk
7 1 üz bilfuird
8 1
9 1
10 1 knirscht
11 1 schiene empohlen
12 1 meldet
and it should print the rownames of the thesaurus when matching any string of a row (Synonym) with the second dataframe.
For example it should print (0070,0110) because Synonym "vipr" of 0070 and Synonym2 "üz" of 0110 matches with the second dataframe.
I tried it with:
#Sure things
a <- thesaurus[grep(PKV_clean$Aufzeichnungen,rownames(thesaurus))]
but this didn't work.
I have also tried creating a separate number for each case and create a 0|1 matrix. This works fine but with over 500 entries this is a lot of manual work.
PKV$"0070" = 0
PKV$"0070"[grepl("vipr | perk",PKV$Aufzeichnungen)] = 1
PKV$"0110" = 0
PKV$"0110"[grepl("üz",PKV$Aufzeichnungen)] = 1
output <- PKV|>
select(where(is.numeric)) |>
select(where(~ sum(.) > 0)) |>
names()
Thank you for your help!
#Mohammed Desouky
If I use the whole dictionary with your code, I get the following output:
Edit 2:
dput(head(thesaurus , 40))
structure(list(Synonym = c("01", "hkp", "hkp", NA, NA, NA, "vipr",
"oberfl anästh", "vest inj", "l1", NA, NA, "gezeigt zu achten",
NA, NA, NA, "pzr", NA, "duraphat aufgetragen", NA, "bmf blutstillung",
NA, "fllg", "f1", "f2", "f2 sät", "f3", "mdv", "f4", NA, NA,
NA, NA, NA, NA, "st", NA, "metallprimer oberfläche", NA, NA),
Synonym2 = c("beobachten", NA, NA, NA, NA, NA, "perk", "anästh",
"vest inj", NA, NA, NA, "putzdruck", NA, NA, NA, "psi sbi api",
NA, "empfindliche stelle aufgetragen", NA, "cp", NA, "fllng",
NA, "fllng 2", "tetric flow", "fllg 3", NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, "monobond plus behandelt", NA, NA), Synonym3 = c("U",
NA, NA, NA, NA, NA, NA, NA, "inj sept blau", NA, NA, NA,
"mhu", NA, NA, NA, "dentalhygiene", NA, "üz", NA, "blutstillung",
NA, "f1", NA, "fllg 2", NA, "fllng 3", NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), Synonym4 = c(NA, NA, NA,
NA, NA, NA, NA, NA, "cx pulpennah", NA, NA, NA, "gezeigt",
NA, NA, NA, NA, NA, "duraphat", NA, "bmf", NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), Synonym5 = c(NA, NA, NA, NA, NA, NA, NA, NA, "infiltration",
NA, NA, NA, "pat gezeigt", NA, NA, NA, NA, NA, NA, NA, "visco gel",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), Synonym6 = c(NA, NA, NA, NA, NA, NA, NA,
NA, "injektion", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), Synonym7 = c(NA, NA, NA, NA, NA,
NA, NA, NA, "infil", NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA), Synonym8 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), Synonym9 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_)), row.names = c("0010",
"0030", "0040", "0050", "0060", "0065", "0070", "0080", "0090",
"0100", "0110", "0120", "1000", "1010", "1020", "1030", "1040",
"2000", "2010", "2020", "2030", "2040", "2050", "2060", "2070",
"2080", "2090", "2100", "2110", "2120", "2130", "2150", "2160",
"2170", "2180", "2190", "2195", "2197", "2200", "2210"), class = "data.frame")```
Can you work with this?
Try this
# For every note in PKV_clean$Aufzeichnungen, collect the row names of all
# thesaurus rows in which at least one synonym matches one of the note's
# words. Each note gets one comma-separated string ("" when nothing matches).
match_codes <- function(note) {
  # Trim before building the regex: leading/trailing whitespace would become
  # an empty alternative ("|word"), and an empty regex matches every synonym.
  pattern <- gsub("\\s+", "|", trimws(note))
  # Skip missing, empty and single-character notes.
  if (is.na(pattern) || nchar(pattern) <= 1) {
    return("")
  }
  # One logical vector per thesaurus column, OR-ed together row-wise. Starting
  # from an all-FALSE vector keeps this working for a one-row (or zero-column)
  # thesaurus, where rowSums(sapply(...)) would fail.
  hit <- Reduce(
    `|`,
    lapply(thesaurus, \(x) grepl(pattern, x)),
    rep(FALSE, nrow(thesaurus))
  )
  toString(rownames(thesaurus)[which(hit)])
}
# vapply() returns a character vector of the right length in one go (no
# growing inside a loop) and copes with a zero-row PKV_clean.
Synonym <- vapply(
  PKV_clean$Aufzeichnungen, match_codes, character(1), USE.NAMES = FALSE
)
PKV_clean$Synonym <- Synonym
output
ID Aufzeichnungen Synonym
1 1 scharfkantig
2 1 t
3 1 aht 36 üz distal 0110
4 1 seit paartagen
5 1 36 vipr 0070
6 1 perk 0070
7 1 üz bilfuird 0110
8 1
9 1
10 1 knirscht
11 1 schiene empohlen
12 1 meldet
I have a dataframe that looks something like this, in which I have several rows for each user, and many NAs in the columns.
user
Effect T1
Effect T2
Effect T3
Benchmark T1
Benchmark T2
Benchmark T3
Tom
01
NA
NA
02
NA
NA
Tom
NA
07
NA
NA
08
NA
Tom
NA
NA
13
NA
NA
14
Larry
03
NA
NA
04
NA
NA
Larry
NA
09
NA
NA
10
NA
Larry
NA
NA
15
NA
NA
16
Dave
05
NA
NA
06
NA
NA
Dave
NA
11
NA
NA
12
NA
Dave
NA
NA
17
NA
NA
18
I want to collapse the rows by user name, filling in the values from each row, like this.
user
Effect T1
Effect T2
Effect T3
Benchmark T1
Benchmark T2
Benchmark T3
Tom
01
07
13
02
08
14
Larry
03
09
15
04
10
16
Dave
05
11
17
06
12
18
How might I accomplish this?
Thank you in advance for your help. Update: I've added the dput of a subset of the actual data below.
structure(list(name = c("Abraham_Ralph", "Abraham_Ralph", "Abraham_Ralph",
"Ackerman_Gary", "Adams_Alma", "Adams_Alma", "Adams_Alma", "Adams_Alma",
"Adams_Sandy", "Aderholt_Robert", "Aderholt_Robert", "Aderholt_Robert",
"Aderholt_Robert", "Aderholt_Robert", "Aguilar_Pete", "Aguilar_Pete",
"Aguilar_Pete"), state = c("LA", "LA", "LA", "NY", "NC", "NC",
"NC", "NC", "FL", "AL", "AL", "AL", "AL", "AL", "CA", "CA", "CA"
), seniority = c(1, 2, 3, 15, 1, 2, 3, 4, 1, 8, 9, 10, 11, 12,
1, 2, 3), legeffect_112 = c(NA, NA, NA, 0.202061712741852, NA,
NA, NA, NA, 1.30758035182953, 3.73544979095459, NA, NA, NA, NA,
NA, NA, NA), legeffect_113 = c(NA, NA, NA, NA, 0, NA, NA, NA,
NA, NA, 0.908495426177979, NA, NA, NA, NA, NA, NA), legeffect_114 = c(2.07501077651978,
NA, NA, NA, NA, 0.84164834022522, NA, NA, NA, NA, NA, 0.340001106262207,
NA, NA, 0.10985741019249, NA, NA), legeffect_115 = c(NA, 0.493490308523178,
NA, NA, NA, NA, 0.587624311447144, NA, NA, NA, NA, NA, 0.159877583384514,
NA, NA, 0.730929613113403, NA), legeffect_116 = c(NA, NA, 0.0397605448961258,
NA, NA, NA, NA, 1.78378939628601, NA, NA, NA, NA, NA, 0.0198802724480629,
NA, NA, 0.0497006773948669), benchmark_112 = c(NA, NA, NA, 0.738679468631744,
NA, NA, NA, NA, 0.82908970117569, 1.39835929870605, NA, NA, NA,
NA, NA, NA, NA), benchmark_113 = c(NA, NA, NA, NA, 0.391001850366592,
NA, NA, NA, NA, NA, 1.58223271369934, NA, NA, NA, NA, NA, NA),
benchmark_114 = c(1.40446054935455, NA, NA, NA, NA, 0.576326191425323,
NA, NA, NA, NA, NA, 1.42212760448456, NA, NA, 0.574363172054291,
NA, NA), benchmark_115 = c(NA, 1.3291300535202, NA, NA, NA,
NA, 0.537361204624176, NA, NA, NA, NA, NA, 1.45703768730164,
NA, NA, 0.523149251937866, NA), benchmark_116 = c(NA, NA,
0.483340591192245, NA, NA, NA, NA, 1.31058621406555, NA,
NA, NA, NA, NA, 0.751261711120605, NA, NA, 1.05683290958405
)), row.names = c(NA, -17L), class = c("tbl_df", "tbl", "data.frame"
))
A data.table solution:
# melt data, remove NA, then recast ...
# First variant: every column except "name" is melted, so "state" and
# "seniority" are carried along as variables too.
dt <- dcast(melt(data.table(d), "name")[!value %in% NA], name ~ variable)
# Second variant: same pipeline, but the "seniority" and "state" rows are
# filtered out before casting, which matches the legeffect_* / benchmark_*
# columns of the table printed below.
dcast(melt(data.table(d), "name")[!value %in% c(NA) & !variable %in% c("variable", "seniority", "state")], name ~ variable)
name legeffect_112 legeffect_113 legeffect_114 legeffect_115 legeffect_116 benchmark_112 benchmark_113 benchmark_114 benchmark_115 benchmark_116
1: Abraham_Ralph <NA> <NA> 2.07501077651978 0.493490308523178 0.0397605448961258 <NA> <NA> 1.40446054935455 1.3291300535202 0.483340591192245
2: Ackerman_Gary 0.202061712741852 <NA> <NA> <NA> <NA> 0.738679468631744 <NA> <NA> <NA> <NA>
3: Adams_Alma <NA> 0 0.84164834022522 0.587624311447144 1.78378939628601 <NA> 0.391001850366592 0.576326191425323 0.537361204624176 1.31058621406555
4: Adams_Sandy 1.30758035182953 <NA> <NA> <NA> <NA> 0.82908970117569 <NA> <NA> <NA> <NA>
5: Aderholt_Robert 3.73544979095459 0.908495426177979 0.340001106262207 0.159877583384514 0.0198802724480629 1.39835929870605 1.58223271369934 1.42212760448456 1.45703768730164 0.751261711120605
6: Aguilar_Pete <NA> <NA> 0.10985741019249 0.730929613113403 0.0497006773948669 <NA> <NA> 0.574363172054291 0.523149251937866 1.05683290958405
Data/Setup
# Load data.table
# install.packages("data.table")
library(data.table)
# Read example data
d <- structure(list(name = c("Abraham_Ralph", "Abraham_Ralph", "Abraham_Ralph",
"Ackerman_Gary", "Adams_Alma", "Adams_Alma", "Adams_Alma", "Adams_Alma",
"Adams_Sandy", "Aderholt_Robert", "Aderholt_Robert", "Aderholt_Robert",
"Aderholt_Robert", "Aderholt_Robert", "Aguilar_Pete", "Aguilar_Pete",
"Aguilar_Pete"), state = c("LA", "LA", "LA", "NY", "NC", "NC",
"NC", "NC", "FL", "AL", "AL", "AL", "AL", "AL", "CA", "CA", "CA"
), seniority = c(1, 2, 3, 15, 1, 2, 3, 4, 1, 8, 9, 10, 11, 12,
1, 2, 3), legeffect_112 = c(NA, NA, NA, 0.202061712741852, NA,
NA, NA, NA, 1.30758035182953, 3.73544979095459, NA, NA, NA, NA,
NA, NA, NA), legeffect_113 = c(NA, NA, NA, NA, 0, NA, NA, NA,
NA, NA, 0.908495426177979, NA, NA, NA, NA, NA, NA), legeffect_114 = c(2.07501077651978,
NA, NA, NA, NA, 0.84164834022522, NA, NA, NA, NA, NA, 0.340001106262207,
NA, NA, 0.10985741019249, NA, NA), legeffect_115 = c(NA, 0.493490308523178,
NA, NA, NA, NA, 0.587624311447144, NA, NA, NA, NA, NA, 0.159877583384514,
NA, NA, 0.730929613113403, NA), legeffect_116 = c(NA, NA, 0.0397605448961258,
NA, NA, NA, NA, 1.78378939628601, NA, NA, NA, NA, NA, 0.0198802724480629,
NA, NA, 0.0497006773948669), benchmark_112 = c(NA, NA, NA, 0.738679468631744,
NA, NA, NA, NA, 0.82908970117569, 1.39835929870605, NA, NA, NA,
NA, NA, NA, NA), benchmark_113 = c(NA, NA, NA, NA, 0.391001850366592,
NA, NA, NA, NA, NA, 1.58223271369934, NA, NA, NA, NA, NA, NA),
benchmark_114 = c(1.40446054935455, NA, NA, NA, NA, 0.576326191425323,
NA, NA, NA, NA, NA, 1.42212760448456, NA, NA, 0.574363172054291,
NA, NA), benchmark_115 = c(NA, 1.3291300535202, NA, NA, NA,
NA, 0.537361204624176, NA, NA, NA, NA, NA, 1.45703768730164,
NA, NA, 0.523149251937866, NA), benchmark_116 = c(NA, NA,
0.483340591192245, NA, NA, NA, NA, 1.31058621406555, NA,
NA, NA, NA, NA, 0.751261711120605, NA, NA, 1.05683290958405
)), row.names = c(NA, -17L), class = c("tbl_df", "tbl", "data.frame"
))
This solution is using only the base functions (no extra packages), but the one-liner may cause eyes to cross, so I'll split it into several functions.
The plan is the following:
Split the original data.frame by the values in name column, using the function by;
For each partition of the data.frame, collapse the columns;
A collapsed column returns the max value of the column, or NA if all its values are NA;
The collapsed data.frame partitions are stacked together.
So, this is a function that does that:
dfr_collapse <- function(dfr, col0)
{
    # Collapse the rows of the data.frame "dfr" that share a value in the
    # column "col0" into a single row per group.
    #
    # Arguments:
    #   dfr  - data.frame (or tibble) to collapse
    #   col0 - name of the grouping column, e.g. "name"
    #
    # Returns a data.frame with one row per distinct value of "col0" (the
    # group value is also used as the row name) and the same columns as
    # "dfr". Each cell holds the maximum of the non-missing values of that
    # column within the group, or NA when the group has no value for it.

    # Max/NA function
    namax <- function(x)
    {
        if(all(is.na(x)))
            NA # !!!
        else
            max(x, na.rm=TRUE)
    }
    # Column collapse function: turns one partition into a one-row
    # data.frame. optional=TRUE keeps column names such as "Effect T1" as
    # they are instead of mangling them into syntactic names.
    byfun <- function(x)
    {
        as.data.frame(lapply(x, namax), optional=TRUE, stringsAsFactors=FALSE)
    }
    # Partition by the grouping column; drop=TRUE skips unused factor levels
    parts <- split(dfr, dfr[[col0]], drop=TRUE)
    # Nothing to collapse: return an empty frame with the same columns
    if(length(parts) == 0)
        return(dfr[0, , drop=FALSE])
    # Stack the one-row partitions; rbind on data.frames yields a proper
    # data.frame (not a matrix of lists) and coerces a logical NA to the
    # column's type whenever another group supplies a value.
    do.call(
        what = rbind,
        args = lapply(parts, byfun)
    )
}
It may not look as slick as a one-liner, but it does the job. It can be turned into a one-liner, but you don't want that.
Assuming that df0 is the data.frame from your dput, you can test this function with
# "name" is the grouping column in the dput data; col0 has no default.
dfr_collapse(df0, "name")
Nota bene: for the sake of simplicity, I return an NA of type logical (see the comment # !!! above). The correct code should convert that NA to the mode of the x vector. Also, the function should check the type of its inputs, etc.
I am trying to figure out how to reshape a dataset of the names of political parties from wide to long using dplyr and pivot_longer.
For each Party_ID, there is a number of constant columns attached (Party_Name_Short, Party_Name, Country, Party_in_orig_title) and a number of time changing factors as well: election, Date, Rename, Reason, Party_Title, alliance, member_parties, split, parent_party, merger, child_party, successor, predecessor. The time changing factors were recorded up to 11 times for each party, as reflected by the index in the colname.
In order to provide a sample I selected the first three time changing columns for each party and a sample of 5 random rows:
structure(list(Party_Name_Short = c("LZJ-PS", "ZiZi", "MNR",
"MDP", "E200"), Party_Name = c("Lista Zorana Jankovica – Pozitivna Slovenija",
"Živi zid", "Mouvement national républicain", "Movimento Democrático Português",
"Erakond Eesti 200"), Country = c("SVN", "HRV", "FRA", "PRT",
"EST"), Party_ID = c(1987, 2612, 1263, 1281, 2720), Party_in_orig_title = c(0,
0, 0, 0, 0), Date1 = c(2011, NA, 1999, 1987, NA), Rename1 = c("Lista Zorana Jankovica – Pozitivna Slovenija",
NA, "Mouvement national républicain", "ID", NA), Reason1 = c("foundation",
NA, "split from FN", "split", NA), Party_Title1 = c(0, NA, 0,
0, NA), alliance1 = c(0, NA, 0, 0, NA), member_parties1 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_),
split1 = c(0, NA, 1, 1, NA), parent_party1 = c(NA, NA, "FN",
"MDP", NA), merger1 = c(0, NA, 0, 0, NA), child_party1 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
), successor1 = c(0, NA, 0, 0, NA), predecessor1 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
), Date2 = c(2012, NA, NA, NA, NA), Rename2 = c("Pozitivna Slovenija",
NA, NA, NA, NA), Reason2 = c("renamed", NA, NA, NA, NA),
Party_Title2 = c(0, NA, NA, NA, NA), alliance2 = c(0, NA,
NA, NA, NA), member_parties2 = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), split2 = c(0,
NA, NA, NA, NA), parent_party2 = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), merger2 = c(0,
NA, NA, NA, NA), child_party2 = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), successor2 = c(0,
NA, NA, NA, NA), predecessor2 = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), Date3 = c(2014,
NA, NA, NA, NA), Rename3 = c("ZaAB", NA, NA, NA, NA), Reason3 = c("split",
NA, NA, NA, NA), Party_Title3 = c(0, NA, NA, NA, NA), alliance3 = c(0,
NA, NA, NA, NA), member_parties3 = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), split3 = c(1,
NA, NA, NA, NA), parent_party3 = c("LZJ-PS", NA, NA, NA,
NA), merger3 = c(0, NA, NA, NA, NA), child_party3 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
), successor3 = c(0, NA, NA, NA, NA), predecessor3 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
), election1 = structure(c(15309, 16740, 11839, 6390, 17956
), class = "Date"), election2 = structure(c(16252, NA, NA,
NA, NA), class = "Date"), election3 = structure(c(16344,
NA, NA, NA, NA), class = "Date")), row.names = c(NA, -5L), class = c("tbl_df",
"tbl", "data.frame"))
I would like the data to follow a "long" structure where each party_id and the constant factors are repeated 11 times and there are single columns for the time changing factors. Following the top-rated answer formulated here I tried different variations of the following command:
pivot_longer(cols = starts_with(c("election", "Date", "Rename", "Reason", "Party_Title",
"alliance", "member_parties", "split", "parent_party",
"merger", "child_party", "successor", "predecessor")),
names_to = c(".value", "election", "Date", "Rename", "Reason", "Party_Title",
"alliance", "member_parties", "split", "parent_party",
"merger", "child_party", "successor", "predecessor"), names_sep = "_") %>%
select(-matches("election[1-9]"), -matches("Date[1-9]"), -matches("Rename[1-9]"),
-matches("Reason[1-9]"), -matches("alliance[1-9]"), -matches("member_parties[1-9]"),
-matches("split[1-9]"), -matches("parent_party[1-9]"), -matches("merger[1-9]"),
-matches("child_party[1-9]"), -matches("successor[1-9]"), -matches("predecessor[1-9]"),
-matches("Party_Title[1-9]"), -matches("election1[0-2]"), -matches("Date1[0-2]"), -matches("Rename1[0-2]"),
-matches("Reason1[0-2]"), -matches("alliance1[0-2]"), -matches("member_parties1[0-2]"),
-matches("split1[0-2]"), -matches("parent_party1[0-2]"), -matches("merger1[0-2]"),
-matches("child_party1[0-2]"), -matches("successor1[0-2]"), -matches("predecessor1[0-2]"),
-matches("Party_Title1[0-2]"))
However, for some reason, I get a lot of missing values and do not achieve the shape of the data I would like to have. I'd appreciate any hint if you have an idea of how to do this. Thanks!
Update:
I would like the final output to look something like:
structure(list(Party_Name_Short = c("LZJ-PS", "ZiZi", "MNR",
"MDP", "E200", "LZJ-PS", "ZiZi", "MNR", "MDP", "E200", "LZJ-PS",
"ZiZi", "MNR", "MDP", "E200"), Party_Name = c("Lista Zorana Jankovica – Pozitivna Slovenija",
"Živi zid", "Mouvement national républicain", "Movimento Democrático Português",
"Erakond Eesti 200", "Lista Zorana Jankovica – Pozitivna Slovenija",
"Živi zid", "Mouvement national républicain", "Movimento Democrático Português",
"Erakond Eesti 200", "Lista Zorana Jankovica – Pozitivna Slovenija",
"Živi zid", "Mouvement national républicain", "Movimento Democrático Português",
"Erakond Eesti 200"), Country = c("SVN", "HRV", "FRA", "PRT",
"EST", "SVN", "HRV", "FRA", "PRT", "EST", "SVN", "HRV", "FRA",
"PRT", "EST"), Party_ID = c(1987, 2612, 1263, 1281, 2720, 1987,
2612, 1263, 1281, 2720, 1987, 2612, 1263, 1281, 2720), Party_in_orig_title = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), time = c(1, 1, 1,
1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3), Date = c(2011, NA, 1999,
1987, NA, 2012, NA, NA, NA, NA, 2014, NA, NA, NA, NA), Rename = c("Lista Zorana Jankovica – Pozitivna Slovenija",
NA, "Mouvement national républicain", "ID", NA, "Pozitivna Slovenija",
NA, NA, NA, NA, "ZaAB", NA, NA, NA, NA), Reason = c("foundation",
NA, "split from FN", "split", NA, "renamed", NA, NA, NA, NA,
"split", NA, NA, NA, NA), Party_Title = c(0, NA, 0, 0, NA, 0,
NA, NA, NA, NA, 0, NA, NA, NA, NA), alliance = c(0, NA, 0, 0,
NA, 0, NA, NA, NA, NA, 0, NA, NA, NA, NA), member_parties = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), split = c(0,
NA, 1, 1, NA, 0, NA, NA, NA, NA, 1, NA, NA, NA, NA), parent_party = c(NA,
NA, "FN", "MDP", NA, NA, NA, NA, NA, NA, "LZJ-PS", NA, NA, NA,
NA), merger = c(0, NA, 0, 0, NA, 0, NA, NA, NA, NA, 0, NA, NA,
NA, NA), child_party = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), successor = c(0, NA, 0, 0, NA, 0, NA,
NA, NA, NA, 0, NA, NA, NA, NA), predecessor = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), election = structure(c(1322697600,
1446336000, 1022889600, 552096000, 1551398400, 1404172800, NA,
NA, NA, NA, 1412121600, NA, NA, NA, NA), class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -15L), class = c("tbl_df",
"tbl", "data.frame"))
Notice: the newly added time column and notice that this is only for example purposes, with three time changing factors, whereas in fact there are 11 in the data.
Using pivot_longer with names_sep to split between a non-digit and a digit at the end of the string
library(tidyr)
library(dplyr)
df1 %>%
  # Every column ending in digits is time-varying: the part before the
  # trailing number becomes the column name (".value"), the number itself
  # goes into "time".
  pivot_longer(cols = matches('\\d+$'), names_to = c(".value", 'time'),
               names_sep="(?<=\\D)(?=\\d+$)") %>%
  # "time" is character, so sort by its numeric value; a plain arrange(time)
  # would put "10" and "11" before "2" once there are more than 9 periods.
  arrange(as.integer(time))
# A tibble: 15 x 19
# Party_Name_Short Party_Name Country Party_ID Party_in_orig_t… time Date Rename Reason Party_Title alliance member_parties split
# <chr> <chr> <chr> <dbl> <dbl> <chr> <dbl> <chr> <chr> <dbl> <dbl> <chr> <dbl>
# 1 LZJ-PS Lista Zor… SVN 1987 0 1 2011 Lista… found… 0 0 <NA> 0
# 2 ZiZi Živi zid HRV 2612 0 1 NA <NA> <NA> NA NA <NA> NA
# 3 MNR Mouvement… FRA 1263 0 1 1999 Mouve… split… 0 0 <NA> 1
# 4 MDP Movimento… PRT 1281 0 1 1987 ID split 0 0 <NA> 1
# 5 E200 Erakond E… EST 2720 0 1 NA <NA> <NA> NA NA <NA> NA
# 6 LZJ-PS Lista Zor… SVN 1987 0 2 2012 Pozit… renam… 0 0 <NA> 0
# 7 ZiZi Živi zid HRV 2612 0 2 NA <NA> <NA> NA NA <NA> NA
# 8 MNR Mouvement… FRA 1263 0 2 NA <NA> <NA> NA NA <NA> NA
# 9 MDP Movimento… PRT 1281 0 2 NA <NA> <NA> NA NA <NA> NA
#10 E200 Erakond E… EST 2720 0 2 NA <NA> <NA> NA NA <NA> NA
#11 LZJ-PS Lista Zor… SVN 1987 0 3 2014 ZaAB split 0 0 <NA> 1
#12 ZiZi Živi zid HRV 2612 0 3 NA <NA> <NA> NA NA <NA> NA
#13 MNR Mouvement… FRA 1263 0 3 NA <NA> <NA> NA NA <NA> NA
#14 MDP Movimento… PRT 1281 0 3 NA <NA> <NA> NA NA <NA> NA
#15 E200 Erakond E… EST 2720 0 3 NA <NA> <NA> NA NA <NA> NA
# … with 6 more variables: parent_party <chr>, merger <dbl>, child_party <chr>, successor <dbl>, predecessor <chr>, election <date>
I have 3 data frames and they are just replicates. So I want to bind them and calculate the mean of each fraction.
Three data frames:
Nr.1
> dput(head(tbl_gel1))
structure(list(Name = c("yal003w", "yal005c", "yal012w", "yal016w",
"yal035w", "yal038w"), `1_1` = c(1.08346521189121, NA, NA, NA,
NA, NA), `1_10` = c(0.267721905361376, 1.43303883148383, 1.61684304894131,
NA, NA, NA), `1_11` = c(0.189487668138674, 0.75522363065885,
1, NA, NA, NA), `1_12` = c(NA, 1.01340492119247, NA, NA, NA,
NA), `1_13` = c(0.374782308020683, 0.945489433731933, NA, NA,
NA, 0.0317297633029047), `1_14` = c(0.437488212634424, 1.18763709680314,
NA, NA, NA, 0.0278039649538794), `1_15` = c(1, 0.963283876302253,
NA, NA, NA, 0.101985769564935), `1_16` = c(0.933864874212228,
0.534233379286527, NA, NA, NA, 0.216767470594226), `1_17` = c(1,
0.665519263271478, NA, NA, 1, 1), `1_18` = c(0.666036574750145,
0.570465125348879, NA, NA, NA, 1.42894349812116), `1_19` = c(0.514337131747938,
0.23204076838128, NA, NA, 1, 1.2521214021452), `1_2` = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), `1_20` = c(NA,
NA, NA, NA, NA, 1.40803677399372), `1_21` = c(1.09990599806138,
NA, NA, NA, NA, 1.04631699593704), `1_22` = c(1.26442418472118,
NA, NA, NA, NA, 0.928872017485782), `1_23` = c(1.11596921281805,
NA, NA, NA, 1, 0.34698227364696), `1_24` = c(0.754496014447251,
NA, NA, NA, 1, 0.222234793614252), `1_3` = c(6.29254185223621,
NA, NA, 0.693642968439352, NA, NA), `1_4` = c(1.36347593974479,
NA, NA, 1, NA, NA), `1_5` = c(0.765885344543765, NA, NA, 1, NA,
NA), `1_6` = c(0.238118001668604, 0.679584207611477, NA, NA,
NA, NA), `1_7` = c(0.847897771442355, 0.277348019879946, NA,
NA, NA, NA), `1_8` = c(0.356154192700505, 1, 0.409523853881517,
NA, NA, NA), `1_9` = c(0.180109142324181, 1, 0.578310191227172,
NA, NA, 0.093113736249161)), .Names = c("Name", "1_1", "1_10",
"1_11", "1_12", "1_13", "1_14", "1_15", "1_16", "1_17", "1_18",
"1_19", "1_2", "1_20", "1_21", "1_22", "1_23", "1_24", "1_3",
"1_4", "1_5", "1_6", "1_7", "1_8", "1_9"), row.names = c(NA,
6L), class = "data.frame")
Nr. 2
> dput(head(tbl_gel2))
structure(list(Name = c("yal003w", "yal005c", "yal012w", "yal016w",
"yal035w", "yal038w"), `2_1` = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), `2_2` = c(1.0548947840373, NA,
NA, NA, NA, NA), `2_3` = c(1.61794716486303, 0.346821796129205,
NA, NA, NA, NA), `2_4` = c(1, NA, NA, 0.378254379051086, NA,
NA), `2_5` = c(0.670710809411423, NA, NA, 1, NA, NA), `2_6` = c(0.313872585645673,
NA, NA, NA, NA, NA), `2_7` = c(0.299293639466945, 0.13920907824675,
NA, NA, NA, NA), `2_8` = c(0.311431376422469, 0.511742245543671,
0.342807141055383, NA, NA, NA), `2_9` = c(0.243672215177189,
1, 0.689138745271004, NA, NA, 0.0540861571772987), `2_10` = c(0.154732102234279,
1.08973258347909, 1, NA, NA, NA), `2_11` = c(0.149365726324845,
1.1210733533474, 1.0427649268992, NA, NA, 0.0955468461925663),
`2_12` = c(0.153741630869067, 2.96276072446013, 1, NA, NA,
NA), `2_13` = c(0.629371115599316, 0.952868912207058, 0.0771105403237483,
NA, NA, 0.0885212695236819), `2_14` = c(0.907644486740723,
1.43000783337778, NA, NA, NA, 0.138102409899801), `2_15` = c(1.09683345304359,
0.423641943213571, NA, NA, NA, 0.255699738225622), `2_16` = c(0.913095779338154,
0.510977400533081, NA, NA, 0.520556617688936, 0.284898552722227
), `2_17` = c(0.935941553863477, 0.388225948821767, NA, NA,
1.14984991998928, 1), `2_18` = c(2.21746156904543, 0.642743615867438,
NA, NA, NA, 2.22716071647178), `2_19` = c(0.500618035526774,
0.282924681750454, NA, NA, NA, 1), `2_20` = c(0.701627311828743,
0.254001731153973, NA, NA, 1, 1.15996914621286), `2_21` = c(1.97359874904275,
NA, NA, NA, 1.67526802494991, 1.38709456754353), `2_22` = c(2.09198896289293,
NA, NA, NA, NA, 0.921672834103247), `2_23` = c(1.18791465369551,
NA, NA, NA, NA, 0.576309066193914), `2_24` = c(0.473199477125101,
0.176144702328764, NA, NA, 1, 0.130236848112641)), .Names = c("Name",
"2_1", "2_2", "2_3", "2_4", "2_5", "2_6", "2_7", "2_8", "2_9",
"2_10", "2_11", "2_12", "2_13", "2_14", "2_15", "2_16", "2_17",
"2_18", "2_19", "2_20", "2_21", "2_22", "2_23", "2_24"), row.names = c(NA,
6L), class = "data.frame")
Nr.3
> dput(head(tbl_gel3))
structure(list(Name = c("yal003w", "yal005c", "yal012w", "yal016w",
"yal035w", "yal038w"), `3_1` = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), `3_2` = c(1, 1.4605309655311,
NA, NA, NA, NA), `3_3` = c(1.74480713727388, 0.42825619952525,
NA, NA, NA, NA), `3_4` = c(1, 0.431712121875013, NA, 0.395182020245312,
NA, NA), `3_5` = c(2.26247329056518, 0.644462177666441, NA, 1,
NA, NA), `3_6` = c(0.619783374266709, 0.472094874244026, NA,
NA, NA, NA), `3_7` = c(0.45731912574756, 0.176354321796083, NA,
NA, NA, NA), `3_8` = c(0.271829278733367, 0.517232771669986,
0.153774052052871, NA, NA, NA), `3_9` = c(0.141017619508583,
1.41279969394534, 0.651948154271122, NA, NA, NA), `3_10` = c(NA,
1.64435171100405, 0.998807430240956, NA, NA, NA), `3_11` = c(0.110046035477971,
1.33684444261939, 1.25595310581771, NA, NA, 0.0236163735479745
), `3_12` = c(NA, 0.982250906830292, 0.39283619985401, NA, NA,
0.0688303458902568), `3_13` = c(0.136798076436642, 0.55729642483448,
0.176525038283566, NA, NA, 0.0251189412372225), `3_14` = c(0.316623893146817,
1, NA, NA, NA, 0.0727823461722849), `3_15` = c(NA, 0.607991038574375,
NA, NA, NA, 0.133968257432001), `3_16` = c(0.362994392402489,
0.547183167896534, NA, NA, NA, 0.0777347708647245), `3_17` = c(1,
0.116561118715651, NA, NA, 0.710972173471528, 1), `3_18` = c(NA,
3.63330458071475, NA, NA, NA, 3.24019081192985), `3_19` = c(NA,
NA, NA, NA, NA, 2.46635222132474), `3_20` = c(0.452303676849426,
0.0896715384025126, NA, NA, 1, 1), `3_21` = c(1.50169299468485,
0.513442106966708, NA, NA, 1.45124841710635, 1.02529618467026
), `3_22` = c(0.565232592993276, 0.748536315065533, NA, NA, 2.9089322117881,
0.782555457293307), `3_23` = c(1.62622280168665, 0.704926586534075,
NA, NA, NA, 0.584486806995139), `3_24` = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_)), .Names = c("Name",
"3_1", "3_2", "3_3", "3_4", "3_5", "3_6", "3_7", "3_8", "3_9",
"3_10", "3_11", "3_12", "3_13", "3_14", "3_15", "3_16", "3_17",
"3_18", "3_19", "3_20", "3_21", "3_22", "3_23", "3_24"), row.names = c(NA,
6L), class = "data.frame")
I used the function below to bind them. Each data frame has a different number of rows, and in some cases different names, so the final table should have more rows than any one of them.
# Full outer join of the replicate tables on "Name": a name that is missing
# from some replicates is kept, with NA in that replicate's columns.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
mylist <- list(tbl_gel1, tbl_gel2, tbl_gel3)
tbl_all <- Reduce(
  function(x, y) merge(x, y, by = "Name", all = TRUE, sort = FALSE),
  mylist,
  accumulate = FALSE
)
Everything works fine up to this point.
Now I want to calculate the mean of each fraction (there are 24 fractions in total).
## Calculating the mean
# Average the replicates of each fraction, pairing columns by name
# ("<replicate>_<fraction>") instead of by position: the columns of tbl_gel1
# are ordered alphabetically (1_1, 1_10, 1_11, ...), so fixed positional
# offsets would average different fractions together.
tbl_all1 <- tbl_all[-1]
fractions <- 1:24
replicates <- 1:3
fraction_means <- vapply(
  fractions,
  function(fraction) {
    # na.rm = TRUE: a value missing in one replicate must not turn the whole
    # mean into NA. Rows with no value in any replicate yield NaN.
    rowMeans(tbl_all1[paste0(replicates, "_", fraction)], na.rm = TRUE)
  },
  numeric(nrow(tbl_all1))
)
tbl_mean <- cbind(tbl_all[1], fraction_means)
There is something wrong with that function, because the sum of many rows is 0. That is definitely wrong, because tbl_gel1 and the others contain only rows with at least one number in some fraction.
If I take a look at tbl_mean, I see that the rows with a sum of 0 are at the bottom.