I have a dataset df1 like so:
snp <- c("rs7513574_T", "rs1627238_A", "rs1171278_C")
p.value <- c(2.635489e-01, 9.836280e-01 , 6.315047e-01 )
df1 <- data.frame(snp, p.value)
I want to remove the _ underscore and the letters after it (representing allele) in df1 and make this into a new dataframe df2
I tried this using the code
df2 <- df1[,c("snp", "allele"):=tstrsplit(`snp`, "_", fixed = TRUE)]
However, this changes the df1 data frame. Is there another way to do this?
This is my best guess as to what you want:
library(tidyr)
separate(df1, snp, into = c("snp", "allele"), sep = "_")
# snp allele p.value
# 1 rs7513574 T 0.2635489
# 2 rs1627238 A 0.9836280
# 3 rs1171278 C 0.6315047
df2 = df1 %>%
dplyr::mutate(across(c(V1, V2, V3), ~stringr::str_remove_all(., "_[:alpha:]")))
> df2
V1 V2 V3
snp rs7513574 rs1627238 rs1171278
p.value 0.2635489 0.983628 0.6315047
Try:
# Drop the underscore and everything after it (the allele suffix), however
# many characters the suffix has; df1 itself is left untouched.
df2 <- df1 %>% mutate(snp = gsub("_.*$", "", snp))
Consider creating a copy of the dataset and do the tstrsplit on the copied data to avoid changes in original data
library(data.table)
df2 <- copy(df1)
setDT(df2)[,c("snp", "allele") := tstrsplit(snp, "_", fixed = TRUE)]
Related
Hello, I have a data frame called df and I have subsetted it into another data frame called df1. Now I'd like to remove the rows of df1 from df to obtain df2 = df - df1. How can I do this in R?
df <- read.csv("dataframe.csv")
df1 <- df[(df$time <= 0.345),]
Try:
df2 <- df[(df$time > 0.345), ]
or
# Negate the logical mask instead of using -which(): when no row matches,
# -which() yields integer(0) and the subset would come back with zero rows.
# Rows whose time is NA are kept, exactly as -which() would keep them.
df2 <- df[!(df$time <= 0.345) | is.na(df$time), ]
If for any reason you strictly have to keep the structure described, this is a possible approach:
# Example data: five rows, of which those with Value <= 34 form the subset df1.
df <- data.frame(
  Sample.Name = c(12, 13, 14, 12, 13),
  Target = c("A", "B", "C", "A", "A"),
  Task = c("Sample", "Standard", "Sample", "Standard", "Sample"),
  Value = c(36, 34, 34, 35, 36),
  Mean = c(35, 32, 36, 37, 35)
)
df1 <- df[df$Value <= 34, ]

# Build one key string per row. A separator is required so that adjacent
# fields cannot run together (e.g. "1" + "23" versus "12" + "3").
row_key <- function(d) do.call(paste, c(unname(as.list(d)), sep = "\r"))

# df2 = df - df1: keep the rows of df whose key does NOT occur in df1.
df2 <- df[!(row_key(df) %in% row_key(df1)), ]
df2
# The result is this one:
#   Sample.Name Target     Task Value Mean
# 1          12      A   Sample    36   35
# 4          12      A Standard    35   37
# 5          13      A   Sample    36   35
This should work without even knowing the logic of first subset
library (dplyr)
df2 <- setdiff(df, df1)
OR
df2 <- anti_join(df, df1)
I have a data frame df with 7 columns and I have a list z containing multiple strings.
I want a dataframe containing only the columns in df which contain a string from z.
df <- data.frame("a_means","b_means","c_means","d_means","e_mean","f_means","g_means")
z <- c("a_m","c_m","f_m")
How do I get the column number of the z strings in df? Or how do I get a dataframe with only the columns which contains the z strings.
What I want is:
print(df)
"a_means" "c_m" "f_m"
What I tried:
match(a, names(df)
and
df[,which(colnames(df) %in% colnames(df[ ,grepl(z,names(df)])]
You can use:
df[,match(z, substring(colnames(df), 1, 3))]
With base R:
z <- paste(z, collapse = "|")
df[, grepl(z, names(df))] # you could use grep as well
Combine the search patterns and use that as a pattern for stringr::str_detect() function.
library(dplyr)
library(stringr)
df <- data.frame(a_means = "a_means",
b_means = "b_means",
c_means = "c_means",
d_means = "d_means",
e_means = "e_means",
f_means = "f_means",
g_means = "g_means"
)
z <- c("a_m","c_m","f_m")
z <- paste(z, collapse = "|")
df %>% select_if(str_detect(names(df), z))
#> a_means c_means f_means
#> 1 a_means c_means f_means
You can simply do this:
library(dplyr)
df %>%
select(contains(z))
Check out help("starts_with"). You can also match to a starting prefix with starts_with() among other things.
You can use select and matches to subset the columns based on z
library(dplyr)
df <- data.frame("a_means","b_means","c_means","d_means","e_mean","f_means","g_means")
z <- c("a_m","c_m","f_m")
df %>%
select(matches(z))
#> X.a_means. X.c_means. X.f_means.
#> 1 a_means c_means f_means
When using the various join functions from dplyr you can either join all variables with the same name (by default) or specify those ones using by = c("a" = "b"). Is there a way to join by exclusion? For example, I have 1000 variables in two data frames and I want to join them by 999 of them, leaving one out. I don't want to do by = c("a1" = "b1", ...,"a999" = "b999"). Is there a way to join by excluding the one variable that is not used?
Ok, using this example from one answer:
set.seed(24)
df1 <- data_frame(alala= LETTERS[1:3], skks= letters[1:3], sskjs=
letters[1:3], val = rnorm(3))
df2 <- data_frame(alala= LETTERS[1:3], skks= letters[1:3], sskjs=
letters[1:3], val = rnorm(3))
I want to join them using all variables excluding val. I'm looking for a more general solution. Assume there are 1000 variables and I only remember the name of the one that I want to exclude from the join, while not knowing the index of that variable. How can I perform the join while only knowing the variable names to exclude? I understand I can find the column index first, but is there a simple way to add exclusions in by =?
We create a named vector to do this
library(dplyr)
grps <- setNames(paste0("b", 1:999), paste0("a", 1:999))
Note the 'grps' vector is created with paste as the OP's post suggested a pattern. If there is no pattern, but we know the column that is not to be grouped
nogroupColumn <- "someColumn"
# In a dplyr `by` vector the NAMES are columns of the left table (df1) and the
# VALUES are the matching columns of the right table (df2), so df2's columns
# go first in setNames() and df1's columns supply the names.
# NOTE(review): columns are paired by position, so both tables are assumed to
# list their join columns in the same order.
grps <- setNames(setdiff(names(df2), nogroupColumn),
                 setdiff(names(df1), nogroupColumn))
inner_join(df1, df2, by = grps)
Using a reproducible example
set.seed(24)
df1 <- data_frame(a1 = LETTERS[1:3], a2 = letters[1:3], val = rnorm(3))
df2 <- data_frame(b1 = LETTERS[3:4], b2 = letters[3:4], valn = rnorm(2))
grps <- setNames(paste0("b", 1:2), paste0("a", 1:2))
inner_join(df1, df2, by = grps)
# A tibble: 1 x 4
# a1 a2 val valn
# <chr> <chr> <dbl> <dbl>
#1 C c 0.420 -0.584
To exclude a certain field(s), you need to identify the index of the columns you want. Here's one way:
which(!names(df1) %in% "sskjs" ) #<this excludes the column "sskjs"
[1] 1 2 4 #<and shows only the desired index columns
Use unite to create a join_id in each dataframe, and join by it.
df1 <- df1 %>%
unite(join_id, which(!names(.) %in% "sskjs"), remove = F)
df2 <- df2 %>%
unite(join_id, which(!names(.) %in% "sskjs"), remove = F)
left_join(df1, df2, by = "join_id" )
I have a data frame with a number of columns in a form var1.mean, var2.mean. I would like to strip the suffix ".mean" from all columns that contain it. I tried using rename_all in conjunction with regex in a pipe but could not come up with a correct syntax. Any suggestions?
If you want to use the dplyr package, I'd recommend using the rename_at function.
# Example frame: two ".mean" columns to rename and one ".sd" column to keep.
Dframe <- data.frame(var1.mean = rnorm(10),
                     var2.mean = rnorm(10),
                     var1.sd = runif(10))
library(dplyr)
# Only columns ending in ".mean" are selected, and the suffix is stripped from
# their names. A formula lambda replaces the deprecated funs() helper.
Dframe %>%
  rename_at(.vars = vars(ends_with(".mean")),
            .funs = ~ sub("[.]mean$", "", .))
Using new dplyr:
# Escape the dot (a bare "." matches any character) and anchor with "$" so
# that only a literal ".mean" suffix is removed.
df %>% rename_with(~str_remove(., "\\.mean$"))
We can use rename_all
df1 %>%
rename_all(.funs = funs(sub("\\..*", "", names(df1)))) %>%
head(2)
# var1 var2 var3 var1 var2 var3
#1 -0.5458808 -0.09411013 0.5266526 -1.3546636 0.08314367 0.5916817
#2 0.5365853 -0.08554095 -1.0736261 -0.9608088 2.78494703 -0.2883407
NOTE: If the column names are duplicated, it needs to be made unique with make.unique
data
set.seed(24)
df1 <- as.data.frame(matrix(rnorm(25*6), 25, 6, dimnames = list(NULL,
paste0(paste0("var", 1:3), rep(c(".mean", ".sd"), each = 3)))))
You may use gsub.
# "\\." matches a literal dot and "$" anchors the match to the end of the
# name, so only a trailing ".mean" is stripped.
colnames(df) <- gsub("\\.mean$", "", colnames(df))
The below works for me
# Strip the ".mean" suffix from the column names of a small example frame.
dat <- data.frame(var1.mean = 1, var2.mean = 2)
col_old <- colnames(dat)
# Escape the dot (a bare "." matches any character) and anchor with "$" so
# only a literal ".mean" at the very end of a name is removed.
col_new <- gsub(pattern = "\\.mean$", replacement = "", x = col_old)
colnames(dat) <- col_new
You can replace this names using stringi package stri_replace_last_regex function like this:
require(stringi)
df <- data.frame(1,2,3,4,5,6)
names(df) <- stri_paste("var",1:6,c(".mean",".sd"))
df
## var1.mean var2.sd var3.mean var4.sd var5.mean var6.sd
##1 1 2 3 4 5 6
names(df) <- stri_replace_last_regex(names(df),"\\.mean$","")
df
## var1 var2.sd var3 var4.sd var5 var6.sd
##1 1 2 3 4 5 6
The regex is \\.mean$ because you need to escape the dot character (it has a special meaning in regex), and the $ sign at the end ensures that you replace only names that END with this pattern (if the .mean text is in the middle of the string then it won't be replaced).
I would use strsplit:
x <- as.data.frame(matrix(runif(16), ncol = 4))
colnames(x) <- c("var1.mean", "var2.mean", "var3.mean", "something.else")
# strsplit() returns a list with one character vector per name, so keep only
# the first piece of each. fixed = TRUE treats ".mean" as a literal string
# instead of a regex in which "." matches any character.
pieces <- strsplit(colnames(x), split = ".mean", fixed = TRUE)
colnames(x) <- vapply(pieces, function(p) p[1L], character(1))
colnames(x)
Lots of quick answers have been given; the most intuitive, to me, would be:
Dframe <- data.frame(var1.mean = rnorm(10), # Create example
                     var2.mean = rnorm(10),
                     var1.sd = runif(10))
# Remove ".mean" only where it is the suffix: "[.]" is a literal dot and "$"
# anchors the match to the end of the name.
names(Dframe) <- gsub("[.]mean$", "", names(Dframe))
This may be a bad question because I am not posting any reproducible example. My main goal is to identify columns that are of different types between two dataframe that have the same column names.
For example
df1
Id Col1 Col2 Col3
Numeric Factor Integer Date
df2
Id Col1 Col2 Col3
Numeric Numeric Integer Date
Here both the dataframes (df1, df2) have same column names but the Col1 type is different and I am interested in identifying such columns. Expected output.
Col1 Factor Numeric
Any suggestions or tips on achieving this ?. Thanks
Try compare_df_cols() from the janitor package:
library(janitor)
mtcars2 <- mtcars
mtcars2$cyl <- as.character(mtcars2$cyl)
compare_df_cols(mtcars, mtcars2, return = "mismatch")
#> column_name mtcars mtcars2
#> 1 cyl numeric character
Self-promotion alert, I authored this package - am posting this function because it exists to solve precisely this problem.
Try this:
#' Compare the classes of the columns shared by two data frames
#'
#' @param df1,df2 Data frames to compare.
#' @return A data frame with one row per column name present in both inputs
#'   (in the order of `df1`) and three columns: `Column`, `df1` and `df2`.
#'   The latter two hold the class of that column in each input; a column
#'   with several classes (e.g. POSIXct) is reported as "class1/class2".
compareColumns <- function(df1, df2) {
  commonNames <- names(df1)[names(df1) %in% names(df2)]
  # Always exactly one string per column, whatever the length of class(x).
  class_of <- function(col) paste(class(col), collapse = "/")
  # List-style `df[names]` keeps a data frame even when there is only one
  # common column; `df[, names]` would drop to a bare vector and the classes
  # would then be computed per element instead of per column.
  data.frame(
    Column = commonNames,
    df1 = vapply(df1[commonNames], class_of, character(1)),
    df2 = vapply(df2[commonNames], class_of, character(1))
  )
}
For a more compact method, you could use a list with sapply(). Efficiency shouldn't be a problem here since all we're doing is grabbing the class. Here I add data frame names to the list to create a more clear output.
m <- sapply(list(df1 = df1, df2 = df2), sapply, class)
m[m[, "df1"] != m[, "df2"], , drop = FALSE]
# df1 df2
# Col1 "factor" "character"
where df1 and df2 are the data from #ycw's answer.
If two data frames have the same column names, then the code below will give you the columns with different classes.
library(dplyr)
m1 = mtcars
m2 = mtcars %>% mutate(cyl = factor(cyl), vs = factor(cyl))
out = cbind(sapply(m1, class), sapply(m2, class))
out[apply(out, 1, function(x) !identical(x[1], x[2])), ]
We can use sapply with class to loop through all columns in df1 and df2. After that, we can compare the results.
# Create example data frames. Col1 is deliberately a factor in df1 and a
# character vector in df2. stringsAsFactors is spelled out for both because
# its default changed from TRUE to FALSE in R 4.0.0; without it both Col1
# columns would be character and no mismatch would be reported.
df1 <- data.frame(ID = 1:3,
                  Col1 = as.character(2:4),
                  Col2 = 2:4,
                  Col3 = as.Date(paste0("2017-01-0", 2:4)),
                  stringsAsFactors = TRUE)
df2 <- data.frame(ID = 1:3,
                  Col1 = as.character(2:4),
                  Col2 = 2:4,
                  Col3 = as.Date(paste0("2017-01-0", 2:4)),
                  stringsAsFactors = FALSE)
# Record the class of every column as a single string; columns with several
# classes are collapsed with "/" so each column yields exactly one value.
col_class <- function(col) paste(class(col), collapse = "/")
class1 <- vapply(df1, col_class, character(1))
class2 <- vapply(df2, col_class, character(1))
# Combine the results, then filter for rows that are different
result <- data.frame(class1, class2, stringsAsFactors = FALSE)
result[result$class1 != result$class2, ]
class1 class2
Col1 factor character