Get single column of values comparing multiple columns - r

I have just started my journey with R. I want to test values across multiple columns for the same condition and return 5 if any of the values is "hello" within a row:
result = ifelse((myData[1] == "hello") | (myData[2] == "hello") | (myData[3] == "hello"), 5, 0)
This works fine, but code seems to be redundant. When I do:
resultSec = ifelse(myData[1:3] == "hello", 5, 0)
Then all 3 columns are checked against the condition, but the result I get is not a single column but 3 columns. So I would then have to perform an additional comparison across those columns, which takes more lines of code in total than the first, redundant method.
How can I get a single column of values in this case in an efficient way?

You can use the function apply() to iterate over a data.frame or matrix, by either columns or rows. The margin argument determines which one you use.
Here we want to check the rows, so we use margin = 1:
# Example data: three text columns, one row per observation.
dat <- data.frame(
  col1 = c("happy", "sad", "mad"),
  col2 = c("tired", "sleepy", "happy"),
  col3 = c("relaxed", "focused", "fine")
)
# MARGIN = 1 hands each row to FUN as one vector. The membership test gives a
# single TRUE/FALSE per row, so a plain if/else is used rather than the
# vectorised ifelse().
dat$res <- apply(
  X = dat, MARGIN = 1,
  FUN = function(x) if ("happy" %in% x) 5 else 0
)
dat
col1 col2 col3 res
1 happy tired relaxed 5
2 sad sleepy focused 0
3 mad happy fine 5

We can use rowSums here
# Flag each row that contains "happy" in at least one column. Comparing the
# per-row count with 0 before scaling keeps the result at 5 even when several
# columns match (a bare rowSums(...) * 5 would give 10 or 15 there);
# na.rm = TRUE stops a missing cell from turning the whole row into NA.
df1$res <- (rowSums(df1 == "happy", na.rm = TRUE) > 0) * 5
df1$res
#[1] 5 0 5
data
df1 <- structure(list(col1 = structure(c(1L, 3L, 2L), .Label = c("happy",
"mad", "sad"), class = "factor"), col2 = structure(c(3L, 2L,
1L), .Label = c("happy", "sleepy", "tired"), class = "factor"),
col3 = structure(c(3L, 2L, 1L), .Label = c("fine", "focused",
"relaxed"), class = "factor")), .Names = c("col1", "col2",
"col3"), row.names = c(NA, -3L), class = "data.frame")

Related

Check if value from one data frame can be found in another data frame in R

I have two data.frames as follows:
a$id <- as.data.frame(c("1-23-2", "2-3-231-2", "122-121"))
b$id <- as.data.frame(c("1-23-2", "122-121", "12-1223-12", "1221-12"))
I want to check, if all values of a can be found in b.
I tried this:
if (a$id %in% b$id){a$test <- "yes"} else {a$test <- "no"}
Which gives a warning message and the wrong result unfortunately.
Use ifelse.
a$test <- ifelse(a$id %in% b$id, "yeah", "no")
a
# id test
# 1 1-23-2 yeah
# 2 2-3-231-2 no
# 3 122-121 yeah
Data
a <- structure(list(id = structure(c(1L, 3L, 2L), .Label = c("1-23-2",
"122-121", "2-3-231-2"), class = "factor")), class = "data.frame", row.names = c(NA,
-3L))
b <- structure(list(id = structure(c(1L, 3L, 2L, 4L), .Label = c("1-23-2",
"12-1223-12", "122-121", "1221-12"), class = "factor")), class = "data.frame", row.names = c(NA,
-4L))
You may have several base R approaches to make it, e.g.,
a <- within(a,test <- ifelse(id %in% b$id,"yes","no"))
or
a <- within(a,test <- c("yes","no")[(!id%in% b$id) + 1])
or
a <- within(a,test <- c("yes","no")[is.na(match(id,b$id))+1])
such that
> a
id test
1 1-23-2 yes
2 2-3-231-2 no
3 122-121 yes
DATA
a <- data.frame(id = c("1-23-2", "2-3-231-2", "122-121"))
b <- data.frame(id = c("1-23-2", "122-121", "12-1223-12", "1221-12"))

How do I split a column in R into two columns when I have no delimiter?

I have a dataset called data1 in which I need to split the first column into two columns. The issue I'm having is that there is no delimiter between the parts I need to split, and the character lengths differ in many rows.
I would like to split it by the date and sex.
E.g
12/1/09male
1/9/20female
13/1/19female
4/12/12male
I've been trying this but because the values have a different amount of characters I'm stuck.
separate(data1, col = 1, into = c("date","sex"), sep = "")
Any help would be hugely appreciated!
An option is a positive look-behind and look-ahead to split on a digit followed by an "m" or "f".
df %>% separate(1, c("date", "sex"), sep = "(?<=\\d)(?=[mf])")
# date sex
#1 12/1/09 male
#2 1/9/20 female
#3 13/1/19 female
#4 4/12/12 male
For what it's worth, the same regexp pattern works in base R's strsplit
# Split each string at the zero-width position between a digit and an "m" or
# "f" (look-behind / look-ahead, hence perl = TRUE), bind the resulting pairs
# row-wise into a data frame, and give the two columns readable names.
setNames(
  do.call(
    rbind.data.frame,
    strsplit(as.character(df[, 1]), "(?<=\\d)(?=[mf])", perl = TRUE)
  ),
  c("date", "sex")
)
Sample data
df <- read.table(text =
'12/1/09male
1/9/20female
13/1/19female
4/12/12male')
I am fairly new to R so I am sure this is not the most elegant solution. I first add a comma between the date and sex and then separate on the comma
a <- data.frame(row_1 = c("12/1/09male", "1/9/20female", "13/1/19female", "4/12/12male"))
a[, "row_1"] = str_replace(a$row_1, "(male|female)", ",\\1")
separate(a, row_1, ",", into = c("date", "sex"))
Using tidyr::extract, we can capture data into two parts. First capture the date (in the format d/m/y) and second capture all the remaining part of the string.
tidyr::extract(df, V1, c("date", "sex"), "(\\d+/\\d+/\\d+)(.*)")
# date sex
#1 12/1/09 male
#2 1/9/20 female
#3 13/1/19 female
#4 4/12/12 male
data
df <- structure(list(V1 = structure(c(2L, 1L, 3L, 4L), .Label = c("1/9/20female",
"12/1/09male", "13/1/19female", "4/12/12male"), class = "factor")),
class = "data.frame", row.names = c(NA,-4L))
Base R solution using gsub and some regex:
df_clean <- within(df, {
date <- as.Date(gsub("[A-Za-z]+", "", V1), format = "%d/%m/%y")
sex <- as.factor(gsub("\\d+|\\/", "", V1))
rm(V1)
}
)
Data:
df <- structure(list(V1 = structure(c(2L, 1L, 3L, 4L), .Label = c("1/9/20female",
"12/1/09male", "13/1/19female", "4/12/12male"), class = "factor")), class = "data.frame", row.names = c(NA,
-4L))

Convert lines from factor to numeric?

this example:
dat=structure(list(X = structure(c(1L, 2L,3L), .Label = c("A", "B", "C"), class = "factor"), X10 = structure(c(1L,2L,3L), .Label = c("3","0", "2"), class = "factor"), X11 = structure(c(1L, 2L,3L), .Label = c("0", "2", "0"), class = "factor")), class = "data.frame", row.names = c(NA, -3L))
dat=dat[,-1]
fi=as.numeric(as.character(dat[1,] ))
> fi
[1] 1 1
Which is not correct. I wonder what is wrong ?
as.numeric is for vector, you need to use apply if you want to apply this to a data frame:
apply(dat, MARGIN=2,FUN=as.numeric)
result:
X10 X11
[1,] 3 0
[2,] 0 2
[3,] 2 0
For multiple columns of different class, we can have a check whether it is factor or not to do the conversion
library(dplyr)
# Convert only the factor columns. Going through as.character() first makes
# as.numeric() parse the labels rather than the underlying integer codes.
# A plain function is passed instead of the deprecated funs() wrapper.
dat %>%
  mutate_if(is.factor, function(x) as.numeric(as.character(x)))
and if all the columns are factor, then use mutate_all
# Every column is a factor here, so convert them all: label text first, then
# number. A plain function is passed instead of the deprecated funs() wrapper.
dat %>%
  mutate_all(function(x) as.numeric(as.character(x)))
The base R way if all columns are factor, use lapply and assign it to the original object
dat[] <- lapply(dat, function(x) as.numeric(as.character(x)))

R Creating Dynamic variables from group aggregated set of DataFrames

My problem statement is: I have a list of data frames df1, df2, df3. The data looks like this:
df1
a,b,c,d
1,2,3,4
1,2,3,4
df2
a,b,c,d
1,2,3,4
1,2,3,4
Now, for these two data frames I should create a new data frame from an aggregated column of each of them. For that I am using the code below:
for(i in 1:2){
assign(paste(final_val,i,sep=''),sum(assign(paste(df,i,sep='')))$d*100)}
I am getting the error:
Error in assign(paste(hvp_route_dsct_clust, i, sep = "")) :
argument "value" is missing, with no default
My output should look like
final_val1 <- 800
final_val2 <- 800
And for those values final_val1,final_val2 I should be creating dataframe dynamicaly
Can anybody please help me on this
If we need to use assign, get the object names from the global environment with ls by specifying the pattern 'df' followed by one or more digits (\\d+) ('nm1'), create another vector of 'final_val' names ('nm2'), loop through the sequence of 'nm1', and assign to each element of 'nm2' the value we get by extracting column 'd' of the corresponding 'df', multiplying it by 100 and taking its sum.
# Names of the input data frames: exactly "df" followed by digits. The
# pattern is anchored so objects whose names merely contain such a run
# (e.g. "pdf1" or "df1_old") are not picked up.
nm1 <- ls(pattern = "^df\\d+$")
# Derive each output name from its input name (df7 -> final_val7). ls()
# returns names sorted alphabetically, so numbering by position would pair
# df10 with final_val2 once there are ten or more inputs.
nm2 <- sub("^df", "final_val", nm1)
for (i in seq_along(nm1)) {
  # Sum of column d scaled by 100, stored under the matching output name.
  assign(nm2[i], sum(get(nm1[i])$d * 100))
}
final_val1
#[1] 800
final_val2
#[1] 800
Otherwise, we place the datasets in a list, extract the 'd' column, multiply with 100 and do the column sums
# Fetch the data frames named in nm1 and reduce each one to a single number:
# the sum of its column d scaled by 100. Summing per data frame (rather than
# building a matrix first) also works when the inputs have different numbers
# of rows, and vapply() guarantees a plain numeric vector.
unname(vapply(mget(nm1), function(x) sum(x[["d"]] * 100), numeric(1)))
#800 800
data
df1 <- structure(list(a = c(1L, 1L), b = c(2L, 2L), c = c(3L, 3L), d = c(4L,
4L)), .Names = c("a", "b", "c", "d"), class = "data.frame", row.names = c(NA,
-2L))
df2 <- structure(list(a = c(1L, 1L), b = c(2L, 2L), c = c(3L, 3L), d = c(4L,
4L)), .Names = c("a", "b", "c", "d"), class = "data.frame", row.names = c(NA,
-2L))

R: remove columns based on two column's similarity check

Input
row.no column2 column3 column4
1 bb ee up
2 bb ee down
3 bb ee up
4 bb yy down
5 bb zz up
I have a rule to remove rows 1, 2 and 3: while column2 and column3 are the same for rows 1, 2 and 3, contradictory data (up and down) are found in column4.
How can I ask R to remove those rows that have the same values in column2 and column3 but contradicting values in column4, to give a matrix as follows:
row.no column2 column3 column4
4 bb yy down
5 bb zz up
The functions in package plyr really shine at this type of problem. Here is a solution using two lines of code.
Set up the data (kindly provided by @GavinSimpson)
dat <- structure(list(row.no = 1:5, column2 = structure(c(1L, 1L, 1L,
1L, 1L), .Label = "bb", class = "factor"), column3 = structure(c(1L,
1L, 1L, 2L, 3L), .Label = c("ee", "yy", "zz"), class = "factor"),
column4 = structure(c(2L, 1L, 2L, 1L, 2L), .Label = c("down",
"up"), class = "factor")), .Names = c("row.no", "column2",
"column3", "column4"), class = "data.frame", row.names = c(NA,
-5L))
Load the plyr package
library(plyr)
Use ddply to split, analyse and combine dat. The following line of code splits dat into the unique combinations of (column2 and column3) and analyses each one separately. I then add a column called unique, which holds the number of unique values of column4 for each set. Finally, use simple subsetting to return only those lines where unique==1, and drop column 5.
df <- ddply(dat, .(column2, column3), transform,
row.no=row.no, unique=length(unique(column4)))
df[df$unique==1, -5]
And the results:
row.no column2 column3 column4
4 4 bb yy down
5 5 bb zz up
Here is one potential, if somewhat inelegant, solution
out <- with(dat, split(dat, interaction(column2, column3)))
out <- lapply(out, function(x) if(NROW(x) > 1) {NULL} else {data.frame(x)})
out <- out[!sapply(out, is.null)]
do.call(rbind, out)
Which gives:
> do.call(rbind, out)
row.no column2 column3 column4
bb.yy 4 bb yy down
bb.zz 5 bb zz up
Some explanation, line by line:
Line 1: splits the data into a list, each component of which is a data frame with rows corresponding to groups formed by unique combinations of column2 and column3.
Line 2: iterate over the result from Line 1; if there are more than 1 row in data frame, return NULL, if not return the 1-row data frame.
Line 3: iterate over the output from Line 2; return only non-NULL components
Line 4: need to bind, row-wise, the output from Line 3, which we arrange via do.call()
This can be simplified to two lines, combining Lines 1-3 into a single line:
out <- lapply(with(dat, split(dat, interaction(column2, column3))),
function(x) if(NROW(x) > 1) {NULL} else {data.frame(x)})
do.call(rbind, out[!sapply(out, is.null)])
The above was all done with:
dat <- structure(list(row.no = 1:5, column2 = structure(c(1L, 1L, 1L,
1L, 1L), .Label = "bb", class = "factor"), column3 = structure(c(1L,
1L, 1L, 2L, 3L), .Label = c("ee", "yy", "zz"), class = "factor"),
column4 = structure(c(2L, 1L, 2L, 1L, 2L), .Label = c("down",
"up"), class = "factor")), .Names = c("row.no", "column2",
"column3", "column4"), class = "data.frame", row.names = c(NA,
-5L))
Gavin keeps raising the bar on the quality of answers. Here's my attempt.
# This is one way of importing the data into R
sally <- read.table(
  header = TRUE,
  text = "row.no column2 column3 column4
1 bb ee up
2 bb ee down
3 bb ee up
4 bb yy down
5 bb zz up"
)
# For every (column2, column3) pair, count how many distinct column4 values
# occur. column4 is turned into integer codes so that ave() hands back a
# number for each row.
sally.outcomes <- ave(
  as.integer(factor(sally$column4)),
  sally$column2, sally$column3,
  FUN = function(v) length(unique(v))
)
# Keep the rows whose pair never disagrees on column4. A logical mask is used
# so the result is still right when nothing has to be removed (negative
# indexing with an empty index would drop every row).
sally.consistent <- sally[sally.outcomes == 1, ]
# Display the lines that have no conflicting values
sally.consistent
You can try one of the following two methods. Suppose the table is called 'table1'.
Method 1
# A row is "repeated" when another row has the same values in columns 2 and 3.
# duplicated() only flags the second and later occurrences, so it is run from
# both ends to catch the first occurrence as well.
key_cols <- table1[, 2:3, drop = FALSE]
is_repeated <- duplicated(key_cols) | duplicated(key_cols, fromLast = TRUE)
repeated_rows <- which(is_repeated)
# Subset with the logical mask: table1[-repeated_rows, ] would return zero
# rows whenever repeated_rows is empty.
table1[!is_repeated, ]
Method 2
# Start from the rows duplicated() flags, i.e. the second and later
# occurrences of a column 2 / column 3 pair.
duplicates <- duplicated(table1[, 2:3])
# seq_along()/seq_len() make both loops run zero times on an empty table.
for (i in seq_along(duplicates)) {
  if (duplicates[i]) {
    # Mark every row sharing that pair; this also catches the first
    # occurrence, which duplicated() leaves as FALSE.
    for (j in seq_len(nrow(table1))) {
      if (sum(table1[i, 2:3] == table1[j, 2:3]) == 2) {
        duplicates[j] <- TRUE
      }
    }
  }
}
table1[!duplicates, ]

Resources