my input data is
df
anger sad joy happy trust disgust
1 1 0 1 2 3 0
2 2 0 0 2 0 3
3 2 2 1 1 1 1
4 0 1 1 1 0 1
I want output like this
mydata
anger sad joy happy trust disgust col
1 1 0 1 2 3 0 trust
2 2 0 0 2 0 3 disgust
For each row, I want to extract the name of the column that holds the maximum value, but keep only the rows where that maximum occurs in exactly one column; any row whose maximum is shared by more than one column should be discarded.
i tried this
d1 <- df[!apply(df[-1], 1, function(x) anyDuplicated(x[x == max(x)])),]
but i am getting this
anger sad joy happy trust disgust
1 1 0 1 2 3 0
2 2 0 0 2 0 3
3 2 2 1 1 1 1
I don't want third row in the output.
Thanks for help in advance.
We can use max.col to get the index of columns for each row after subsetting the rows
# Flag rows in which any value (ignoring the first column) is repeated.
has_repeat <- apply(mydata[-1], 1, anyDuplicated) != 0
# Keep the rows without repeats and label each with its first max column.
d1 <- mydata[!has_repeat, ]
value_names <- names(d1)[-1]
d1$out <- value_names[max.col(d1[-1], ties.method = "first")]
d1
# zone_id v1 v2 v3 v4 out
#1 1 12 15 18 20 v4
#3 3 31 28 14 2 v1
#4 4 12 16 9 5 v2
#5 5 5 18 10 12 v2
Update
If the OP wanted to remove only the duplicate values of max values, then replace the first line with
d1 <- mydata[!apply(mydata[-1], 1, function(x) anyDuplicated(x[x == max(x)])),]
Update2
Based on the newdataset by the OP, we don't need to remove the first column as it is not an id column
# TRUE for rows whose maximum value occurs more than once.
max_is_tied <- apply(mydata1, 1, function(row) {
  anyDuplicated(row[row == max(row)]) > 0
})
# Keep rows with a single maximum and record the name of the column holding it.
d2 <- mydata1[!max_is_tied, ]
d2$out <- names(d2)[max.col(d2, ties.method = "first")]
d2
# anger sad joy happy trust disgust out
#1 1 0 1 2 3 0 trust
#2 2 0 0 2 0 3 disgust
data
mydata1 <- structure(list(anger = c(1L, 2L, 2L, 0L), sad = c(0L, 0L, 2L,
1L), joy = c(1L, 0L, 1L, 1L), happy = c(2L, 2L, 1L, 1L), trust = c(3L,
0L, 1L, 0L), disgust = c(0L, 3L, 1L, 1L)), .Names = c("anger", "sad",
"joy", "happy", "trust", "disgust"), row.names = c(NA, 4L),
class = "data.frame")
you can try:
# Label each zone with the column holding its row-wise maximum, keeping only
# rows where that maximum is not shared with another column.
mydata %>%
select(-zone_id) %>%
# mx = element-wise maximum across the remaining columns, i.e. one max per row
mutate(mx = do.call(pmax, (.))) %>%
select(mx) %>%
# put mx back alongside the original columns (mx becomes the first column)
cbind(mydata) %>%
# count the cells in each row equal to mx; the mx column itself always counts once
mutate( flg = rowSums(. == mx)) %>%
# flg == 2 means exactly one other column equals the maximum
# NOTE(review): zone_id is compared too, so a zone_id equal to mx would be counted - confirm
filter(flg ==2) %>%
select(-flg) %>%
# reshape to long form: one row per (zone_id, column name, value)
gather(key = out, value= v, -mx, -zone_id) %>%
# keep only the column whose value is the row maximum
filter(mx == v) %>%
select(zone_id, mx, out) %>%
# re-attach the original columns; no `by` is given, so join keys are inferred from shared names
left_join(mydata)
which gives:
zone_id mx out v1 v2 v3 v4
1 3 31 v1 31 28 2 2
2 4 16 v2 1 16 9 1
3 5 18 v2 5 18 10 12
4 1 20 v4 12 15 18 20
Related
I have two data frames
df1:
01.2020 02.2020 03.2020
11190 4 1 2
12345 3 3 1
11323 1 2 2
df2
08.2020 04.2020 09.2020
11190 1 2 2
12345 1 2 3
11324 1 2 2
Dummy Data -
df1 <- structure(list(`01.2020` = c(4L, 3L, 1L), `02.2020` = c(1L, 3L, 2L), `03.2020` = c(2L, 1L, 2L)), class = "data.frame", row.names = c("11190","12345", "11323"))
df2 <- structure(list(`08.2020` = c(1L, 1L, 1L), `04.2020` = c(2L, 2L, 2L), `09.2020` = c(2L, 3L, 2L)), class = "data.frame", row.names = c("11190", "12345", "11324"))
I want to "outer merge" these two dataframes by key = index
How can we do that? what should be there in the place of by=
merge(x = sheet1_UN, y = sheet2_UN, by = "" , all = TRUE)
I want my final dataframe to look something like this
01.2020 02.2020 03.2020 08.2020 04.2020 09.2020
11190 4 1 2 1 2 2
12345 3 3 1 1 2 3
11323 1 2 2 - - -
11324 - - - 1 2 2
Thanks in advance.
another method
df3 <- merge(df1, df2, by = "row.names", all = TRUE)
output:
Row.names 01.2020 02.2020 03.2020 08.2020 04.2020 09.2020
1 11190 4 1 2 1 2 2
2 11323 1 2 2 NA NA NA
3 11324 NA NA NA 1 2 2
4 12345 3 3 1 1 2 3
This should do:
df1 %>% rownames_to_column('id') %>%
full_join(df2 %>% rownames_to_column('id'), by='id')
output:
id 01.2020 02.2020 03.2020 08.2020 04.2020 09.2020
1 11190 4 1 2 1 2 2
2 12345 3 3 1 1 2 3
3 11323 1 2 2 NA NA NA
4 11324 NA NA NA 1 2 2
You might use replace_na('-') if you want no NA values, like this:
df1 %>% rownames_to_column('id') %>%
full_join(df2 %>% rownames_to_column('id'), by='id') %>%
mutate(across(everything(), ~.x %>% as.character %>% replace_na('-')))
I have a question for the community and hoping for some help.
I am trying to duplicate a data frame like the one below:
ID Time Solve
1 0 1
1 2 2
1 4 3
1 6 1
I am trying to duplicate the above data frame 100 times so, it would read as below:
ID Time Solve
1 0 1
1 2 2
1 4 3
1 6 1
2 0 1
2 2 2
2 4 3
2 6 1
3 0 1
3 2 2
3 4 3
3 6 1
4 0 1
4 2 2
4 4 3
4 6 1
.....
100 0 1
100 2 2
100 4 3
100 6 1
Does anyone have a good solution for this or a resource to read up on this?
Thanks!
We can use replicate
# Stack 100 copies of df1 on top of each other, then number the copies 1..100.
copies <- replicate(100, df1, simplify = FALSE)
out <- do.call(rbind, copies)
out$ID <- rep(seq_len(100L), each = nrow(df1))
Or another option is rep
# Cycle through df1's row indices 100 times (1..k, 1..k, ...), then number
# each pass 1..100.
row_idx <- rep(seq_len(nrow(df1)), times = 100)
out <- df1[row_idx, ]
out$ID <- rep(seq_len(100L), each = nrow(df1))
Or make use of uncount
library(tidyr)
library(dplyr)
# uncount() repeats every row 100 times *consecutively* (row 1 x 100, row 2 x 100,
# ...), so the copies of each original row are numbered 1..100 and the result is
# then sorted by that number to get one complete copy of df1 per ID.
uncount(df1, 100) %>%
  mutate(ID = rep(seq_len(100L), times = nrow(df1))) %>%
  arrange(ID)
Or another option is
df1 %>%
nest_by(ID) %>%
uncount(100) %>%
mutate(ID = row_number()) %>%
unnest(c(data))
data
df1 <- structure(list(ID = c(1L, 1L, 1L, 1L), Time = c(0L, 2L, 4L, 6L
), Solve = c(1L, 2L, 3L, 1L)), class = "data.frame", row.names = c(NA,
-4L))
I have say this dataframe:
gene0 1 2 3
gene1 0 0 5
gene2 6 8 0
gene3 5 5 5
0 0 5
1 2 3
and I would like to associate the numbers in the "unnamed" rows with genes, to get the following:
gene0 1 2 3
gene1 0 0 5
gene2 6 8 0
gene3 5 5 5
gene1 0 0 5
gene0 1 2 3
What is the best way to do it? Do I need to use linux or R for that?
One dplyr and tidyr option could be:
df %>%
group_by_at(-1) %>%
fill(V1)
V1 V2 V3 V4
<chr> <int> <int> <int>
1 gene0 1 2 3
2 gene1 0 0 5
3 gene2 6 8 0
4 gene3 5 5 5
5 gene1 0 0 5
6 gene0 1 2 3
Or:
df %>%
group_by(group = group_indices(., !!!select(., -1))) %>%
fill(V1) %>%
ungroup() %>%
select(-group)
Sample data:
df <- read.table(text = "gene0 1 2 3
gene1 0 0 5
gene2 6 8 0
gene3 5 5 5
NA 0 0 5
NA 1 2 3",
header = FALSE,
na.strings = "NA",
stringsAsFactors = FALSE)
We can use match from base R
# One key per row, built from every column except the gene name (V1).
a1 <- do.call(paste, df1[-1])
# Look each key up among the rows that already have a gene name and take the
# name from there. Matching against the named rows themselves (rather than
# unique(a1)) keeps the match positions and the name vector in the same index
# space, so it works wherever the unnamed rows sit; keys with no named
# counterpart stay NA.
named <- !is.na(df1$V1)
df1$V1 <- df1$V1[named][match(a1, a1[named])]
df1$V1
#[1] "gene0" "gene1" "gene2" "gene3" "gene1" "gene0"
Update
Using the OP's dataset
df1 <- read.csv("newest.csv", stringsAsFactors = FALSE)
# Treat empty ids as missing.
df1$id[df1$id == ""] <- NA
# One key per row, built from every column except the first (id) column.
a1 <- do.call(paste, df1[-1])
# Take each row's id from the first row that has the same key AND a non-missing
# id. Matching against the named rows keeps the match positions and the id
# vector in the same index space; keys with no named counterpart stay NA.
named <- !is.na(df1$id)
df1$id <- df1$id[named][match(a1, a1[named])]
length(unique(df1$id))
#[1] 621
head(df1$id, 20)
#[1] "pop13_110" "pop1_2" "pop16_108" "pop2_10" "pop2_2" "pop2_3" "pop2_4" "pop2_5" "pop2_6" "pop2_7" "pop2_8"
#[12] "pop2_9" "pop2_10" "pop2_11" "pop7_81" "pop2_13" "pop2_15" "pop2_15" "pop2_16" "pop22_20"
tail(df1$id, 20)
# [1] "pop22_2" "pop22_3" "pop22_4" "pop22_5" "pop22_8" "pop22_9" "pop13_60" "pop16_131" "pop23_11" "pop22_25" "pop22"
#[12] "pop22_14" "pop22_15" "pop22_32" "pop22_28" "pop16_56" "pop22_18" "pop9_9" "pop22_21" "pop22_22"
data
df1 <- structure(list(V1 = c("gene0", "gene1", "gene2", "gene3", NA,
NA), V2 = c(1L, 0L, 6L, 5L, 0L, 1L), V3 = c(2L, 0L, 8L, 5L, 0L,
2L), V4 = c(3L, 5L, 0L, 5L, 5L, 3L)), class = "data.frame",
row.names = c(NA,
-6L))
The naive solution
library(tidyverse)
df <- tribble(~col1,~col2,~col3,
1,2,3,
0,0,5,
6,8,0,
5,5,5,
0,0,5,
1,2,3,
1,1,1)
# Hard-code one rule per known gene; anything that matches no rule gets the
# fallback label.
df %>%
  mutate(gene = case_when(col1 == 1 & col2 == 2 & col3 == 3 ~ "gene0",
                          col1 == 0 & col2 == 0 & col3 == 5 ~ "gene1",
                          col1 == 6 & col2 == 8 & col3 == 0 ~ "gene2",
                          col1 == 5 & col2 == 5 & col3 == 5 ~ "gene3",
                          TRUE ~ "unknown_gene"))
Another, much more extensible option is to create a tibble with the gene definitions (it can even be imported from Excel or a similar source)
df1 <- tribble(~gene,~col1,~col2,~col3,
'gene0',1,2,3,
'gene1',0,0,5,
'gene2',6,8,0,
'gene3',5,5,5)
and simply join the new observation on it
# Attach the gene name to every observation whose three values match a
# definition; observations without a matching definition get NA.
# The join keys are spelled out so the join does not depend on whichever
# column names the two tables happen to share.
df %>%
  left_join(df1, by = c("col1", "col2", "col3"))
Here is another base R solution, in addition to the one by @akrun, where match() compares whole rows across columns V2 to V4
# Fill the missing gene names by comparing each unnamed row's values (all
# columns except V1) against the rows that already have a name.
# The lookup table must be the *named* rows; matching the unnamed rows against
# themselves only returns 1, 2, ... and assigns names by position.
known <- subset(df, !is.na(V1))
unnamed <- which(is.na(df$V1))
# t() + data.frame() turns every row into one list element so that match()
# compares whole rows; the positions it returns index into `known`.
df$V1[unnamed] <- known$V1[match(data.frame(t(df[unnamed, -1])),
                                 data.frame(t(known[-1])))]
such that
> df
V1 V2 V3 V4
1 gene0 1 2 3
2 gene1 0 0 5
3 gene2 6 8 0
4 gene3 5 5 5
5 gene0 0 0 5
6 gene1 1 2 3
I have this df:
> df <- data.frame(Adults = sample(0:5, 10, replace = TRUE),
+ Children = sample(0:2, 10, replace = TRUE),
+ Teens = sample(1:3, 10, replace = TRUE),
+ stringsAsFactors = FALSE)
> df
Adults Children Teens
1 5 0 1
2 5 1 2
3 5 2 3
4 5 2 2
5 0 1 2
6 5 1 3
7 0 2 3
8 4 2 1
9 4 0 1
10 1 2 1
We can see that Children doesn't have 3,4,5 values and Teens doesn't have 0,4,5 values. However, we know that Adults, Children, and Teens could have from 0 to 5.
When I use group_by() with summarise(), summarise drops the columns I'm not grouping. The code:
df %>%
group_by(Adults) %>% mutate(n_Adults = n()) %>%
group_by(Teens) %>% mutate(n_Teens = n()) %>%
group_by(Children) %>% mutate(n_Children = n())
And when I group by c(0,1,2,3,4,5) (in order to have all the possible values) it gives me this error:
Error in mutate_impl(.data, dots) : Column `c(0, 1, 2, 3, 4, 5)` must be length 10 (the number of rows) or one, not 6
I'm looking for this output:
Values n_Adults n_Children n_Teens p_Adults p_Children p_Teens
0 2 2 0 0.2 0.2 0
1 1 3 4 0.1 0.3 0.4
2 0 5 3 0 0.5 0.3
3 0 0 3 0 0 0.3
4 2 0 0 0.2 0 0
5 5 0 0 0.5 0 0
Where n_ is the count of the respective column and p_ is the percentage of the respective column.
We can gather the data into 'long' format, get the frequency with count after converting the 'value' to factor with levels specified as 0:5, spread to 'wide' format and create the 'p' columns by dividing with the sum of each column and if needed change the column name (with rename_at)
library(tidyverse)
# Long format: one row per (column name, value).
gather(df) %>%
  # Count each value per column. The factor fixes the possible values to 0:5
  # and .drop = FALSE keeps levels with no observations, so a value that is
  # absent from every column still gets a row.
  count(key, value = factor(value, levels = 0:5), .drop = FALSE) %>%
  # Back to wide format: one count column per original column.
  spread(key, n, fill = 0) %>%
  # Proportions: each count divided by its column total (adds *_p columns).
  mutate_at(2:4, list(p = ~ . / sum(.))) %>%
  # Mark the original count columns with an _n suffix.
  rename_at(2:4, ~ paste0(.x, "_n"))
data
df <- structure(list(Adults = c(1L, 1L, 4L, 3L, 3L, 5L, 1L, 4L, 4L,
1L), Children = c(1L, 1L, 2L, 2L, 0L, 2L, 0L, 0L, 1L, 0L), Teens = c(1L,
2L, 3L, 1L, 1L, 3L, 1L, 2L, 2L, 1L)), class = "data.frame", row.names = c(NA,
-10L))
library(reprex)
library(tidyverse)
set.seed(20)
df <- data.frame(Adults = sample(0:5, 10, replace = TRUE),
Children = sample(0:2, 10, replace = TRUE),
Teens = sample(1:3, 10, replace = TRUE),
stringsAsFactors = FALSE)
df
#> Adults Children Teens
#> 1 5 2 2
#> 2 4 2 1
#> 3 1 0 2
#> 4 3 2 1
#> 5 5 0 1
#> 6 5 1 1
#> 7 0 0 3
#> 8 0 0 3
#> 9 1 0 1
#> 10 2 2 3
# Frequency of each observed value, one lookup table per column.
df_adults <- rename(count(df, Adults), n_Adults = n)
df_childred <- rename(count(df, Children), n_Children = n)
df_teens <- rename(count(df, Teens), n_Teens = n)
# Start from every possible value (0-5) so that values never observed still
# get a row, attach the three count tables, and turn the NAs left by the joins
# into zero counts.
df_new <- data.frame(unique_id = 0:5) %>%
  left_join(df_adults, by = c("unique_id" = "Adults")) %>%
  left_join(df_childred, by = c("unique_id" = "Children")) %>%
  left_join(df_teens, by = c("unique_id" = "Teens")) %>%
  replace_na(list(n_Adults = 0, n_Children = 0, n_Teens = 0))
# Proportions: each count divided by its column total.
df_new %>%
  mutate(
    p_Adults = n_Adults / sum(n_Adults),
    p_Children = n_Children / sum(n_Children),
    p_Teens = n_Teens / sum(n_Teens)
  )
#> unique_id n_Adults n_Children n_Teens p_Adults p_Children p_Teens
#> 1 0 2 5 0 0.2 0.5 0.0
#> 2 1 2 1 5 0.2 0.1 0.5
#> 3 2 1 4 2 0.1 0.4 0.2
#> 4 3 1 0 3 0.1 0.0 0.3
#> 5 4 1 0 0 0.1 0.0 0.0
#> 6 5 3 0 0 0.3 0.0 0.0
Created on 2019-02-25 by the reprex package (v0.2.1)
I have 200 columns and want to calculate mean and rank and then generate columns. Here is an example of data
df<-read.table(text="Q1a Q2a Q3b Q4c Q5a Q6c Q7b
1 2 4 2 2 0 1
3 2 1 2 2 1 1
4 3 2 1 1 1 1",h=T)
I want to sum a, b and c for each row, and then sum them together. Next I want to calculate the rank for each row. I want to generate the following table:
Q1a Q2a Q3b Q4c Q5a Q6c Q7b a b c Total Rank
1 2 4 2 2 0 1 5 5 2 12 2
3 2 1 2 2 1 1 7 2 3 12 2
4 3 2 1 1 1 1 8 3 2 13 1
library(dplyr)
# Add one row-sum column per question type (the trailing letter of the column
# name), then the grand total and its rank (1 = largest total, ties share the
# best rank).
df %>%
  cbind(sapply(c("a", "b", "c"), function(x) {
    # Anchor the letter at the end of the name so only the type suffix is
    # matched, not the same letter appearing elsewhere in a column name.
    rowSums(.[, grep(paste0(x, "$"), names(.)), drop = FALSE])
  })) %>%
  mutate(Total = a + b + c,
         Rank = match(Total, sort(Total, decreasing = TRUE)))
Output is:
Q1a Q2a Q3b Q4c Q5a Q6c Q7b a b c Total Rank
1 1 2 4 2 2 0 1 5 5 2 12 2
2 3 2 1 2 2 1 1 7 2 3 12 2
3 4 3 2 1 1 1 1 8 3 2 13 1
Sample data:
df <- structure(list(Q1a = c(1L, 3L, 4L), Q2a = c(2L, 2L, 3L), Q3b = c(4L,
1L, 2L), Q4c = c(2L, 2L, 1L), Q5a = c(2L, 2L, 1L), Q6c = c(0L,
1L, 1L), Q7b = c(1L, 1L, 1L)), class = "data.frame", row.names = c(NA,
-3L))
You can also go with the tidyverse approach. However, it is longer.
library(tidyverse)
# Same computation in long format: sum the values per row and question type,
# spread the types back into columns, then add the total and its rank.
df %>%
  rownames_to_column(var = "ID") %>%
  gather(question, value, -ID) %>%
  # The type is the LAST character of the column name; a fixed position such
  # as 3 would break for two- and three-digit questions like Q10a or Q200c.
  mutate(type = substr(question, nchar(question), nchar(question))) %>%
  group_by(ID, type) %>%
  summarise(sumType = sum(value, na.rm = TRUE)) %>%
  as.data.frame() %>%
  spread(type, sumType) %>%
  mutate(Total = a + b + c,
         Rank = match(Total, sort(Total, decreasing = TRUE)))
Results:
ID a b c Total Rank
1 1 5 5 2 12 2
2 2 7 2 3 12 2
3 3 8 3 2 13 1