I have a data frame in which I would like to prefix some column names with "high_" or "low_". Columns X2-X4 should get the "high_" prefix (e.g. high_X2) and columns X5-X7 the "low_" prefix (e.g. low_X6).
Please see an example below.
X1 X2 X3 X4 X5 X6 X7
a 1 0 1 1 1 1 0
b 2 2 1 1 1 1 0
result
X1 high_X2 high_X3 high_X4 low_X5 low_X6 low_X7
a 1 0 1 1 1 1 0
b 2 2 1 1 1 1 0
You can use rep and paste -
# Prefix every column except the first: "high_" for the next three columns,
# "low_" for the three after that.
names(df)[-1] <- paste0(rep(c("high_", "low_"), each = 3), names(df)[-1])
df
# X1 high_X2 high_X3 high_X4 low_X5 low_X6 low_X7
#a 1 0 1 1 1 1 0
#b 2 2 1 1 1 1 0
If you want to rely on range of columns then dplyr code would be easier.
library(dplyr)
# Rename by column range: X2..X4 get a "high_" prefix, X5..X7 a "low_" prefix.
df %>%
  rename_with(function(nm) paste0("high_", nm), X2:X4) %>%
  rename_with(function(nm) paste0("low_", nm), X5:X7)
The base solution (which is more straightforward for this kind of thing, imo)
# Example frame: rows "a" and "b", integer columns X1..X7.
df <- data.frame(
  X1 = c(a = 1L, b = 2L),
  X2 = c(a = 0L, b = 2L),
  X3 = c(a = 1L, b = 1L),
  X4 = c(a = 1L, b = 1L),
  X5 = c(a = 1L, b = 1L),
  X6 = c(a = 1L, b = 1L),
  X7 = c(a = 1L, b = 1L)
)

# Flag the columns whose numeric suffix (everything after the first
# character of the name) is even, then tag those names with "_is_pair".
cn <- colnames(df)
cond <- (as.integer(substring(cn, 2L)) %% 2L) == 0L
colnames(df)[cond] <- paste0(cn[cond], "_is_pair")
A tidyverse solution (a bit more awkward due to the tidyeval)
library(dplyr)
library(stringr)
library(tidyselect)
# Example frame: rows "a" and "b", integer columns X1..X7.
df <- data.frame(
  X1 = c(a = 1L, b = 2L),
  X2 = c(a = 0L, b = 2L),
  X3 = c(a = 1L, b = 1L),
  X4 = c(a = 1L, b = 1L),
  X5 = c(a = 1L, b = 1L),
  X6 = c(a = 1L, b = 1L),
  X7 = c(a = 1L, b = 1L)
)

# Selection helper: from the variables currently being selected, keep those
# whose numeric suffix (everything after the first character) is even.
is_pair <- function(vars = peek_vars(fn = "is_pair")) {
  suffix <- as.integer(str_sub(vars, 2L))
  vars[suffix %% 2L == 0L]
}

# Append "_is_pair" to every column picked by the helper.
df %>%
  rename_with(function(nm) paste0(nm, "_is_pair"), is_pair())
Related
I have a dataset to which I am trying to apply a ruleset. I would like to find out which rule an observation hit (if any) and record the result.
Here's an example. The first data frame df contains the observations. The second contains the ruleset rs. The third contains the desired result fn.
My question is how do I take the rule strings, apply each one to each observation until I get a match and then record which rule was hit? I would prefer a tidy solution but this seems like it might require a loop. Any insights are appreciated.
# Observations: one row per ID with a numeric x1 and an integer x2.
df <- data.frame(
  ID = c("A", "B", "C"),
  x1 = c(1, 2, 3),
  x2 = c(0L, 1L, 0L)
)

# Ruleset: each rule is an R expression, stored as text, over x1 and x2.
rs <- data.frame(
  RID = c(1, 2),
  Rule = c(
    "x1 <= 2 & x2 == 0L",
    "x1 > 2 & x2 == 0L"
  )
)

# Desired result: the observations plus the RID of the rule each one hit
# (NA when no rule matched).
fn <- data.frame(
  ID = c("A", "B", "C"),
  x1 = c(1, 2, 3),
  x2 = c(0L, 1L, 0L),
  Rule = c(1, NA, 2)
)
> df
ID x1 x2
1 A 1 0
2 B 2 1
3 C 3 0
> rs
RID Rule
1 1 "x1 <= 2 & x2 == 0L"
2 2 "x1 > 2 & x2 == 0L"
> fn
ID x1 x2 Rule
1 A 1 0 1
2 B 2 1 NA
3 C 3 0 2
Try this using parse and eval. The Rule column is a list because an observation can match more than one rule (or none).
library(dplyr)
# For each observation, collect the RIDs of every rule it satisfies.
# `Rule` is a list column: a row may match several rules, or none.
# NOTE(review): every rule string is parsed and evaluated as R code, so the
# ruleset must come from a trusted source.
df %>%
  rowwise() %>%
  mutate(Rule = list(rs$RID[vapply(
    rs$Rule,
    # spell out `text =`; vapply guarantees a logical mask even when the
    # ruleset is empty
    function(rule) eval(parse(text = rule)),
    logical(1)
  )])) %>%
  data.frame()
ID x1 x2 Rule
1 A 1 0 1
2 B 2 1
3 C 3 0 2
Edit: for big data sets maybe try data.table
library(data.table)
setDT(df)
# Evaluate every rule string against one row at a time (one group per row)
# and assign the RID selected by the resulting logical mask.
# NOTE(review): rule strings are executed as R code; use trusted rules only.
df[, Rule := rs$RID[vapply(
  rs$Rule,
  function(rule) eval(parse(text = rule)),
  logical(1)
)], by = seq_len(nrow(df))]
df
ID x1 x2 Rule
1: A 1 0 1
2: B 2 1 NA
3: C 3 0 2
This question is probably best illustrated with an example.
Suppose I have a dataframe df with a binary variable b (values of b are 0 or 1). How can I take a random sample of size 10 from this dataframe so that I have 2 instances where b=0 in the random sample, and 8 instances where b=1 in the dataframe?
Right now, I know that I can do df[sample(nrow(df), 10), ] to get part of the answer, but that would give me a random number of 0 and 1 instances. How can I request a specific number of 0 and 1 instances while still taking a random sample?
Here's an example of how I'd do this... take two samples and combine them. I've written a simple function so you can "just take one sample."
With a vector:
pop <- sample(c(0,1), 100, replace = TRUE)
# Draw a stratified sample from a 0/1 vector.
#   pop    : vector of 0s and 1s to sample from
#   n_zero : how many 0s to draw
#   n_one  : how many 1s to draw
# Returns the drawn 0s followed by the drawn 1s.
yoursample <- function(pop, n_zero, n_one) {
  zeros <- pop[pop == 0]
  ones <- pop[pop == 1]
  drawn_zeros <- sample(zeros, n_zero)
  drawn_ones <- sample(ones, n_one)
  c(drawn_zeros, drawn_ones)
}
yoursample(pop, n_zero = 2, n_one = 8)
[1] 0 0 1 1 1 1 1 1 1 1
Or, if you are working with a dataframe with some unique index called id:
# Where d1 is your data you are summarizing with mean and sd
dat <- data.frame(
id = 1:100,
val = sample(c(0,1), 100, replace = TRUE),
d1 = runif(100))
# Draw a stratified sample of ids from a data frame.
#   dat    : data frame with an `id` column and a 0/1 column `val`
#   n_zero : how many ids to draw among rows with val == 0
#   n_one  : how many ids to draw among rows with val == 1
# Returns the sampled ids: the val == 0 draws first, then the val == 1 draws.
yoursample <- function(dat, n_zero, n_one) {
  # sample(x, n) treats a single number x as the range 1:x, so a stratum with
  # exactly one id could yield ids that are not in it. Drawing positions with
  # sample.int() and indexing back always returns ids from the stratum.
  draw <- function(ids, n) ids[sample.int(length(ids), n)]
  c(draw(dat[dat$val == 0, "id"], n_zero),
    draw(dat[dat$val == 1, "id"], n_one))
}
sample_ids <- yoursample(dat, n_zero = 2, n_one = 8)
sample_ids
mean(dat[dat$id %in% sample_ids,"d1"])
sd(dat[dat$id %in% sample_ids,"d1"])
Here is a suggestion:
First create a sample of 0 and 1 with id column.
Then draw 2 rows where value is 0 and 8 rows where value is 1, and bind the two samples together:
library(tidyverse)
set.seed(123)
# Example data: 50 random 0/1 draws in `value`, plus a row id.
df <- tibble(value = sample(0:1, size = 50, replace = TRUE)) %>%
  mutate(id = row_number())

# Row positions of each stratum.
rows_zero <- which(df$value == 0)
rows_one <- which(df$value == 1)

# sample(x, n) treats a length-one x as the range 1:x, so a stratum with a
# single row would silently draw from unrelated rows. Draw positions with
# sample.int() and index back into the stratum instead.
df1 <- df[rows_zero[sample.int(length(rows_zero), 2)], ]
df2 <- df[rows_one[sample.int(length(rows_one), 8)], ]
df_final <- bind_rows(df1, df2)
value id
<int> <int>
1 0 14
2 0 36
3 1 21
4 1 24
5 1 2
6 1 50
7 1 49
8 1 41
9 1 28
10 1 33
library(tidyverse)
set.seed(123)
# Example data: one row per lowercase letter with a random 0/1 flag in `b`.
# `replace = TRUE` is spelled out: `T` is an ordinary variable that can be
# reassigned, and the positional form hides which argument it is.
df <- data.frame(a = letters,
                 b = sample(c(0, 1), 26, replace = TRUE))
# Draw 2 rows with b == 0 and 8 rows with b == 1, stack them (zeros first),
# then sort the combined sample by `a`.
arrange(
  bind_rows(
    sample_n(filter(df, b == 0), 2),
    sample_n(filter(df, b == 1), 8)
  ),
  a
)
a b
1 d 1
2 g 1
3 h 1
4 l 1
5 m 1
6 o 1
7 p 0
8 q 1
9 s 0
10 v 1
I have a dataframe like below
# Two character vectors; x1 ends with a missing value.
x1 <- c("a", "bd", "c", NA)
x2 <- c("cd", "fd", "g", "ew")
# rbind() stacks the vectors as rows named "x1" and "x2".
df <- as.data.frame(rbind(x1, x2), stringsAsFactors = FALSE)
And I want to convert this dataframe to like below dataframe
I was trying to spread original dataframe using tidyr.
How can I convert this dataframe?
We can reshape into 'long' format with pivot_longer after creating a column of rownames (rownames_to_column), and then convert it back to 'wide' with pivot_wider and change the column 'rn' back to rownames (column_to_rownames)
library(dplyr)
library(tidyr)
library(tibble)
# Build one indicator column per cell value: 1 where the row contains the
# value, 0 elsewhere. Row names are carried through as the helper column `rn`.
df %>%
  rownames_to_column("rn") %>%
  pivot_longer(cols = -rn, values_drop_na = TRUE) %>%
  select(-name) %>%
  mutate(n = 1) %>%
  pivot_wider(
    names_from = value,
    values_from = n,
    values_fill = list(n = 0)
  ) %>%
  column_to_rownames("rn")
# a bd c cd fd g ew
#x1 1 1 1 0 0 0 0
#x2 0 0 0 1 1 1 1
Or using table from base R
# Cross-tabulate row labels against cell values. unlist(df) runs through the
# data frame column by column, so the row names are repeated once per column
# to stay aligned with it.
table(rep(row.names(df), ncol(df)), unlist(df))
# a bd c cd ew fd g
# x1 1 1 1 0 0 0 0
# x2 0 0 0 1 1 1 1
Here is a base R solution
# Non-missing cell values, read row by row; these become the output columns.
u <- na.omit(as.vector(t(df)))
# For every row of df, mark with 1/0 which of those values the row contains.
dfout <- data.frame(t(apply(
  df, 1,
  function(row_vals) as.integer(u %in% row_vals)
)))
names(dfout) <- u
such that
> dfout
a bd c cd fd g ew
x1 1 1 1 0 0 0 0
x2 0 0 0 1 1 1 1
I have a data-frame with these characteristics:
Z Y X1 X2 X3 X4 X5 ... X30
A n1 1 2 1 2 1 2 1 2
B n2 1 2 1 2 1 2 1 2
C n3 1 2 1 2 1 2 1 2
D n4 1 2 1 2 1 2 1 2
.
.
.
My goal is to stack the columns X1, X2, ..., X30 into a single column, and to label each stacked value with the corresponding values of Z and Y plus the name of the X column it came from. Something like this:
Newcolumn zyx
1 x-y-z
... I need a data-frame like this:
colum1 colum2
1 A+n1+X1.headername 1
2 B+n2+X2.headernam 2
3 C+n3X3.headername 1
4 D+n4X4.headername 2
. .
. .
. .
I’m trying to build a function, but I have some troubles
I follow this code for the data-frame:
df$zy <- paste(df$z,"-",df$y)
After that, I eliminate the columns “z” and “y”:
df$z <- NULL
df$y <- NULL
And save column df$zy as data-frame for use later:
df_zy <- as.data.frame(df$zy)
Then eliminate df$xy of original dataframe:
df$xy <- NULL
After that, I save as data-frame the column x1, and incorporate df_zy and name of column x1 (the name is “1”):
a <- as.data.frame(df$`1`)
b <- cbind(a, df_xy, x_column= 1)
b$zy <- paste(b$x_column,"-",b$` df$zy`)
b$` df$zy ` <- NULL
b$ x_column <- NULL
colnames(b)
names(b)[names(b) == "b$`1`"] <- "new_column"
This works, but only for the column x1 and I need this for x1 to x30, and stack all new column
Does anybody have an answer to this problem? Thanks!
You can use the tidyr and dplyr libraries:
library(dplyr)
library(tidyr)
# Stack every column whose name starts with "X" into long form, then label
# each value with "<Z>-<Y>-<original column name>".
df_zy <- df %>%
  pivot_longer(
    cols = starts_with("X"),
    names_to = "Variables",
    values_to = "Value"
  ) %>%
  mutate(NewColumn = paste0(Z, "-", Y, "-", Variables)) %>%
  select(NewColumn, Value)
And you get:
> df_zy
# A tibble: 8 x 2
NewColumn Value
<chr> <dbl>
1 A-n1-X1 1
2 A-n1-X2 2
3 B-n2-X1 1
4 B-n2-X2 2
5 C-n3-X1 1
6 C-n3-X2 2
7 D-n4-X1 1
8 D-n4-X2 2
Data
# Example input: two label columns (Z, Y) and two value columns (X1, X2).
df <- data.frame(
  Z = LETTERS[1:4],
  Y = c("n1", "n2", "n3", "n4"),
  X1 = c(1, 1, 1, 1),
  X2 = c(2, 2, 2, 2)
)
Is it what you are looking for ?
My Data:
A/11:36/0,A/11:36/1,A/11:36/2,A/23:01/0,A/23:01/1,A/23:01/2,B/15:07/0,B/15:07/1,B/15:07/2
1,26,2,1,10,2,1,0,0
Output Expecting:
Name 0 1 2
A/11:36 1 26 2
A/23:01 1 10 2
B/15:07 1 0 0
My Code
library(reshape)
# Load splitstackshape (provides cSplit); library() takes the package name
# directly, not another library() call.
library(splitstackshape)
input <- read.csv("D:/input.csv")
t_input <- t(input)
colnames(t_input)<- c("Name","Val")
data<-cSplit(t_input, 'V1', sep="/", type.convert=FALSE)
# here am going wrong, My script splitting the column1 into 3 columns.
final_data <- cast(data, X1~X2)
I need help splitting my column 1 into two, as follows:
A/11:36 0
A/11:36 1
A/11:36 2
A/23:01 0
A/23:01 1
A/23:01 2
B/15:07 0
B/15:07 1
B/15:07 2
Can anybody help me to solve this ?
Here's a tidyr solution:
# read the sample data: the first line holds the "group/time/x" labels and
# the second line the values, so there is no header row
data <- read.csv("input.csv", header = FALSE)
# transpose so that each label/value pair becomes one row
tdata <- t(data)
colnames(tdata) <- c("name", "value")
df <- data.frame(tdata)
library(tidyr)
new_df <- df %>%
  # extract the variables stored in 'name' to their own columns
  separate(name, c("group", "time", "x"), "/") %>%
  # transform to wide format
  spread(x, value, sep = "")
# final result
new_df
# group time x0 x1 x2
# 1 A 11:36 1 26 2
# 2 A 23:01 1 10 2
# 3 B 15:07 1 0 0
# if, for some reason, you really want the group and time columns together
new_df %>% unite(name, group, time, sep = "/")
# name x0 x1 x2
# 1 A/11:36 1 26 2
# 2 A/23:01 1 10 2
# 3 B/15:07 1 0 0
# or if you want them together and skip the unite step, you can separate directly
# by splitting at a / that is not followed by another / anywhere in the string
df %>%
separate(name, c("name", "x"), "/(?!.*/)") %>%
spread(x, value, sep = "")
# name x0 x1 x2
# 1 A/11:36 1 26 2
# 2 A/23:01 1 10 2
# 3 B/15:07 1 0 0
# read the sample data (no header row: line 1 is labels, line 2 is values)
input <- read.csv("input.csv", header = FALSE)
# transpose so that each label/value pair becomes one row
t_input <- t(input)
colnames(t_input) <- c("name", "value")
df <- data.frame(t_input)
library(splitstackshape)
# split "A/11:36/0" into name_1 / name_2 / name_3; pass the data frame built
# above rather than the raw character matrix
new_df <- cSplit(df, "name", sep = "/", type.convert = FALSE)
# one row per (name_1, name_2); each name_3 level becomes a value.* column
df1 <- reshape(new_df, timevar = "name_3", idvar = c("name_1", "name_2"),
               direction = "wide")
# rebuild the combined "A/11:36" label and drop its two parts
df2 <- within(df1, Name <- paste(name_1, name_2, sep = "/"))
df2[, c("name_1", "name_2") := NULL]
Finaldf <- subset(df2, select = c(Name, value.0:value.2))
write.csv(Finaldf, "output.csv", row.names = FALSE)
output
Name value.0 value.1 value.2
A/11:36 1 26 2
A/23:01 1 10 2
B/15:07 1 0 0