Remove cases with a specific value in R

I want to remove all rows with value 9999 in my dataset.
Here is an example dataset:
df <- data.frame(a=c(1, 3, 4, 6, 9999, 9),
b=c(7, 8, 8, 7, 13, 16),
c=c(11, 13, 9999, 18, 19, 22),
d=c(12, 16, 18, 22, 29, 38))
So the final dataset will only contain rows 1, 2, 4 and 6. I have a large dataset, so how can I do this without specifying the names of all columns? Thank you!

You could do:
df[which(apply(df, 1, \(i) !any(i == 9999))),]
#> a b c d
#> 1 1 7 11 12
#> 2 3 8 13 16
#> 4 6 7 18 22
#> 6 9 16 22 38

df[-which(df == 9999, TRUE)[,1], ]
a b c d
1 1 7 11 12
2 3 8 13 16
4 6 7 18 22
6 9 16 22 38

An option with dplyr
library(dplyr)
df %>%
filter(!if_any(everything(), ~ . == 9999))
-output
a b c d
1 1 7 11 12
2 3 8 13 16
3 6 7 18 22
4 9 16 22 38
Or with across
df %>%
filter(across(everything(), ~ . != 9999))

Related

Create a new column with the duplicated values in both columns in R

I inputted my file in R and obtained as data.frame.
x <- read_xlsx ("C:/Users/gtutk/Desktop/example.xlsx")
df <- data.frame (x)
x y
1 1 1
2 2 2
3 3 3
4 4 5
5 5 6
6 6 7
7 7 9
8 8 10
9 9 11
10 10 13
11 11 14
12 12 15
13 13 17
14 14 18
15 15 19
16 16 14
17 17 15
18 18 17
19 19 18
20 12 19
21 13 20
22 14 21
23 15 22
24 16 23
25 17 24
26 18 25
27 19 26
I want to create a new merged Excel file containing the values duplicated across the two columns. In other words, I want to remove the values unique to either column and keep only the values that appear in both columns.
Expected file:
new
1 1
2 2
3 3
4 5
5 6
6 7
7 9
8 10
9 11
10 13
11 14
12 15
13 17
14 18
15 19
# If you have exactly two columns:
data.frame(new = do.call(intersect, df))
# For more than 2 columns:
data.frame(new = Reduce(intersect, df))
new
1 1
2 2
3 3
4 5
5 6
6 7
7 9
8 10
9 11
10 13
11 14
12 15
13 17
14 18
15 19
Reproducible input data:
data.frame(
x = c(
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 12, 13,
14, 15, 16, 17, 18, 19
),
y = c(
1, 2, 3, 5, 6, 7, 9, 10, 11, 13, 14, 15, 17, 18, 19, 14, 15, 17, 18, 19, 20,
21, 22, 23, 24, 25, 26
)
)
We can try the following code, which can deal with the case of more than 2 columns
> d <- table(stack(df))
> data.frame(new = as.numeric(row.names(d))[rowSums(d > 0) == length(df)])
new
1 1
2 2
3 3
4 5
5 6
6 7
7 9
8 10
9 11
10 13
11 14
12 15
13 17
14 18
15 19
I have added extra columns, z and w, to show how to use this approach when there is more than 2 columns:
library(tidyverse)
df <- structure(list(x = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 12, 13, 14, 15, 16, 17, 18, 19), y = c(1,
2, 3, 5, 6, 7, 9, 10, 11, 13, 14, 15, 17, 18, 19, 14, 15, 17,
18, 19, 20, 21, 22, 23, 24, 25, 26)), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -27L), spec = structure(list(
cols = list(x = structure(list(), class = c("collector_double",
"collector")), y = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))
df$z <- rep(12, nrow(df))
df$w <- rep(8, nrow(df))
df %>%
transmute(new = if_else(x %in% flatten(.[,-1]), x, NA_real_)) %>%
drop_na(new) %>% distinct
#> new
#> 1 1
#> 2 2
#> 3 3
#> 4 5
#> 5 6
#> 6 7
#> 7 8
#> 8 9
#> 9 10
#> 10 11
#> 11 12
#> 12 13
#> 13 14
#> 14 15
#> 15 17
#> 16 18
#> 17 19

Omit rows with 0 in many specific columns

I have a very wide dataset with multiple psychometric scales and I would like to remove rows if any of a handful of columns contains zero (i.e., a missing response).
I know how to do it when the data frame is small, but my method is not scalable. For example,
dftry <- data.frame(x = c(1, 2, 5, 3, 0), y = c(0, 10, 5, 3, 37), z=c(12, 0, 33, 22, 23))
x y z
1 1 0 12
2 2 10 0
3 5 5 33
4 3 3 22
5 0 37 23
# Remove row if it has 0 in y or z columns
# is there a difference between & and , ?
dftry %>% filter(dftry$y > 0 & dftry$z > 0)
x y z
1 5 5 33
2 3 3 22
3 0 37 23
In my actual data, I want to remove rows if there are zeroes in any of these columns:
# this is the most succinct way of selecting the columns in question
select(c(1:42, contains("BMIS"), "hamD", "GAD"))
You can use rowSums :
cols <- c('y', 'z')
dftry[rowSums(dftry[cols] == 0, na.rm = TRUE) == 0, ]
# x y z
#1 5 5 33
#2 3 3 22
#3 0 37 23
We can integrate this into dplyr for your real use-case.
library(dplyr)
dftry %>%
filter(rowSums(select(.,
c(1:42, contains("BMIS"), "hamD", "GAD")) == 0, na.rm = TRUE) == 0)
Does this work using dplyr:
> library(dplyr)
> dftry
x y z a b c BMIS_1 BMIS_3 hamD GAD m n
1 1 0 12 1 0 12 1 0 12 12 12 12
2 2 10 0 2 10 0 2 10 0 0 0 0
3 5 5 33 5 5 33 5 5 33 33 33 33
4 3 3 22 3 3 22 3 3 22 22 22 22
5 0 37 23 0 37 23 0 37 23 23 23 23
> dftry %>% select(c(1:3,contains('BMIS'), hamD, GAD)) %>% filter_all(all_vars(. != 0))
x y z BMIS_1 BMIS_3 hamD GAD
1 5 5 33 5 5 33 33
2 3 3 22 3 3 22 22
>
Data used:
> dftry
x y z a b c BMIS_1 BMIS_3 hamD GAD m n
1 1 0 12 1 0 12 1 0 12 12 12 12
2 2 10 0 2 10 0 2 10 0 0 0 0
3 5 5 33 5 5 33 5 5 33 33 33 33
4 3 3 22 3 3 22 3 3 22 22 22 22
5 0 37 23 0 37 23 0 37 23 23 23 23
> dput(dftry)
structure(list(x = c(1, 2, 5, 3, 0), y = c(0, 10, 5, 3, 37),
z = c(12, 0, 33, 22, 23), a = c(1, 2, 5, 3, 0), b = c(0,
10, 5, 3, 37), c = c(12, 0, 33, 22, 23), BMIS_1 = c(1, 2,
5, 3, 0), BMIS_3 = c(0, 10, 5, 3, 37), hamD = c(12, 0, 33,
22, 23), GAD = c(12, 0, 33, 22, 23), m = c(12, 0, 33, 22,
23), n = c(12, 0, 33, 22, 23)), class = "data.frame", row.names = c(NA,
-5L))
>

Replace several values and keep others same efficiently in R

I have a dataframe like the following:
combo_2 combo_4 combo_7 combo_9
12 23 14 17
21 32 41 71
2 3 1 7
1 2 4 1
21 23 14 71
2 32 1 7
Each column has two single-digit values and two double-digit values composed of the single-digit values in each possible order.
I am trying to determine how to replace certain values in the dataframe so that there is only one version of the double-digit value. For example, all values of 21 in the first column should be 12. All values of 32 in the second column should become 23.
I know I can do something like this using the following code:
df <- df %>%
mutate_at(vars(combo_2, combo_4, combo_7, combo_9), function(x)
case_when(x == 21 ~ 12, x == 32 ~ 23, x == 41 ~ 14, x == 71 ~ 17))
The problem with this is that it gives me a dataframe that contains the correct values when specified but leaves all the other values as NA. The resulting dataframe only contains values where 21, 32, 41, and 71 were. I know I could address this by specifying each value, like x == 1 ~ 1. However, I have many values and would prefer to only specify the ones that I am trying to change.
How can I replace several values in a dataframe without all the other values becoming NA? Is there a way for me to replace the values I want to replace while holding the other values the same without directly specifying those values?
You can use TRUE ~ x at the end of your case_when() sequence:
df %>%
mutate_at(vars(combo_2, combo_4, combo_7, combo_9), function(x)
case_when(x == 21 ~ 12, x == 32 ~ 23, x == 41 ~ 14, x == 71 ~ 17, TRUE ~ x))
combo_2 combo_4 combo_7 combo_9
1 12 23 14 17
2 12 23 14 17
3 2 3 1 7
4 1 2 4 1
5 12 23 14 17
6 2 23 1 7
Another option that may be more efficient would be data.table's fcase() function.
Data:
df = read.table(header = TRUE, text = "combo_2 combo_4 combo_7 combo_9
12 23 14 17
21 32 41 71
2 3 1 7
1 2 4 1
21 23 14 71
2 32 1 7")
df[] = lapply(df, as.double) # side-note: tidyverse has become very strict about types
One dplyr and stringi option may be:
df %>%
mutate(across(everything(),
~ if_else(. %in% c(21, 32, 41, 71), as.integer(stri_reverse(.)), .)))
combo_2 combo_4 combo_7 combo_9
1 12 23 14 17
2 12 23 14 17
3 2 3 1 7
4 1 2 4 1
5 12 23 14 17
6 2 23 1 7
Using mapply:
df1[] <- mapply(function(d, x1, x2){ ifelse(d == x1, x2, d) },
d = df1,
x1 = c(21, 32, 41, 71),
x2 = c(12, 23, 14, 17))
df1
# combo_2 combo_4 combo_7 combo_9
# 1 12 23 14 17
# 2 12 23 14 17
# 3 2 3 1 7
# 4 1 2 4 1
# 5 12 23 14 17
# 6 2 23 1 7

Expand data.frame using vector of values present in one column in R

I have a dataframe that looks something like this:
df <- data.frame(entrez = c(1:10, 1), entrez_HS = c(11:19, 19, 20))
entrez entrez_HS
1 1 11
2 2 12
3 3 13
4 4 14
5 5 15
6 6 16
7 7 17
8 8 18
9 9 19
10 10 19
11 1 20
I also have vector of values that exist in df$entrez_HS:
entrez_HS <- c(11, 11, 12, 19, 19)
For every value in entrez_HS, I want the row(s) of df where df$entrez_HS is equal to the value. Duplicate entries in entrez_HS should result in duplicated rows. Here is the result that I would expect for the above df:
entrez entrez_HS
1 1 11
2 1 11
3 2 12
4 9 19
5 10 19
6 9 19
7 10 19
I'm not sure how to approach this. Thank you!
merge the data together:
merge(mget("entrez_HS"), df, by="entrez_HS")
#or
merge(data.frame(entrez_HS), df, by="entrez_HS")
# entrez_HS entrez
#1 11 1
#2 11 1
#3 12 2
#4 19 9
#5 19 10
#6 19 9
#7 19 10
Without using any package, we can try this:
# Create data
df <- data.frame(entrez = c(1:10, 1), entrez_HS = c(11:19, 19, 20))
entrez_HS <- c(11, 11, 12, 19, 19)
# Extract information, then collect it
result <- lapply(entrez_HS, function(i) df[df$entrez_HS==i,])
result <- do.call("rbind", result)
Here is another option
rbind(df[match(entrez_HS, df$entrez_HS),],
df[duplicated(df$entrez_HS)|duplicated(df$entrez_HS,
fromLast=TRUE),])
# entrez entrez_HS
#1 1 11
#1.1 1 11
#2 2 12
#9 9 19
#9.1 9 19
#91 9 19
#10 10 19
Or using dplyr
library(dplyr)
left_join(data_frame(entrez_HS), df)
# entrez_HS entrez
# <dbl> <dbl>
#1 11 1
#2 11 1
#3 12 2
#4 19 9
#5 19 10
#6 19 9
#7 19 10

dplyr- renaming sequence of columns with select function

I'm trying to rename my columns in dplyr. I found that this can be done with the select function. However, when I try to rename selected columns using a sequence, I cannot get them renamed in the format that I want.
test = data.frame(x = rep(1:3, each = 2),
group =rep(c("Group 1","Group 2"),3),
y1=c(22,8,11,4,7,5),
y2=c(22,18,21,14,17,15),
y3=c(23,18,51,44,27,35),
y4=c(21,28,311,24,227,225))
CC <- paste("CC",seq(0,3,1),sep="")
aa<-test%>%
select(AC=x,AR=group,CC=y1:y4)
head(aa)
AC AR CC1 CC2 CC3 CC4
1 1 Group 1 22 22 23 21
2 1 Group 2 8 18 18 28
3 2 Group 1 11 21 51 311
4 2 Group 2 4 14 44 24
5 3 Group 1 7 17 27 227
6 3 Group 2 5 15 35 225
The problem is that even though I set the CC values as CC0, CC1, CC2, CC3, the output automatically produces column names starting from CC1.
how can I solve this issue?
I think you'll have an easier time creating such an expression with the select_ function:
library(dplyr)
test <- data.frame(x=rep(1:3, each=2),
group=rep(c("Group 1", "Group 2"), 3),
y1=c(22, 8, 11, 4, 7, 5),
y2=c(22, 18, 21, 14, 17, 15),
y3=c(23, 18, 51, 44, 27, 35),
y4=c(21, 28, 311,24, 227, 225))
# build out our select "translation" named vector
DQ <- paste0("y", 1:4)
names(DQ) <- paste0("DQ", seq(0, 3, 1))
# take a look
DQ
## DQ0 DQ1 DQ2 DQ3
## "y1" "y2" "y3" "y4"
test %>%
select_("AC"="x", "AR"="group", .dots=DQ)
## AC AR DQ0 DQ1 DQ2 DQ3
## 1 1 Group 1 22 22 23 21
## 2 1 Group 2 8 18 18 28
## 3 2 Group 1 11 21 51 311
## 4 2 Group 2 4 14 44 24
## 5 3 Group 1 7 17 27 227
## 6 3 Group 2 5 15 35 225

Resources