# Example data: four columns, each holding 100 uniform random draws.
data <- data.frame(
  cat = runif(100),
  dog = runif(100),
  fox = runif(100),
  bunny = runif(100)
)
I just wish to rename such that cat = var01, dog = var04, fox = var07, bunny = var09.
We can pass a named vector (new name = old name) to rename() and splice it in with the unquote-splice operator (!!!)
library(dplyr)
# Old column names and their replacements, matched by position.
nm1 <- c("cat", "dog", "fox", "bunny")
nm2 <- c("var01", "var04", "var07", "var09")
Or create it with seq
# The requested numbers (1, 4, 7, 9) are not evenly spaced: a plain
# seq(by = 3) of length four ends in 10, not 9. Build the regular part
# with seq() and append the final 9 explicitly.
nm2 <- sprintf("var%02d", c(seq(1, 7, by = 3), 9))
# setNames(nm1, nm2) yields c(new = "old", ...); !!! splices those pairs
# into rename() as individual new = old arguments.
data <- data %>%
  rename(!!!setNames(nm1, nm2))
Or with setnames from data.table to change the column names in place by providing a vector of 'old', 'new' names
library(data.table)
# Convert `data` to a data.table.
setDT(data)
# Rename the columns listed in nm1 to the matching entries of nm2, in place.
setnames(data, nm1, nm2)
# Print the column names to confirm the rename.
names(data)
#[1] "var01" "var04" "var07" "var09"
If you want to rename only specific columns from the data you could use
library(dplyr)
# Rename the four columns explicitly (new_name = old_name), then preview
# the first rows of the result.
data %>%
  rename(
    var01 = cat,
    var04 = dog,
    var07 = fox,
    var09 = bunny
  ) %>%
  head()
# var01 var04 var07 var09
#1 0.3817939 0.82917877 0.29435146 0.07547698
#2 0.7235733 0.89619003 0.11643227 0.07026431
#3 0.2500442 0.01800189 0.02804676 0.29175499
#4 0.1229257 0.87631870 0.86204151 0.83269660
#5 0.2191805 0.90387735 0.75390315 0.59554349
#6 0.5019568 0.87161199 0.05806871 0.31988761
Related
I have the following example data.frame which contains multiple variables (i.e., A,B,C).
# Three example data frames sharing the same ten daily dates, each with
# variables A, B and C drawn from uniform distributions.
# The runif() calls stay in the original order so the seeded values match.
set.seed(123)
D1 <- data.frame(
  Date = seq(as.Date("2001-01-01"), to = as.Date("2001-01-10"), by = "day"),
  A = runif(n = 10, min = 1, max = 5),
  B = runif(n = 10, min = 3, max = 6),
  C = runif(n = 10, min = 2, max = 5)
)
D2 <- data.frame(
  Date = seq(as.Date("2001-01-01"), to = as.Date("2001-01-10"), by = "day"),
  A = runif(n = 10, min = 1, max = 5),
  B = runif(n = 10, min = 3, max = 6),
  C = runif(n = 10, min = 2, max = 5)
)
D3 <- data.frame(
  Date = seq(as.Date("2001-01-01"), to = as.Date("2001-01-10"), by = "day"),
  A = runif(n = 10, min = 1, max = 5),
  B = runif(n = 10, min = 3, max = 6),
  C = runif(n = 10, min = 2, max = 5)
)
Target
I want to grab each variable from all the data.frames and save it as a new data.frame, named after the variable, with column names built from the data.frame name plus the variable name, like below
# Sketch of the desired output: one data frame per variable, holding Date
# plus that variable taken from D1, D2 and D3.
# NOTE(review): Date, D1A, D2A, ... are not defined anywhere in the visible
# snippet, so this reads as pseudo-code rather than runnable code.
A <- data.frame(Date, D1A,D2A,D3A)
B <- data.frame(Date,D1B,D2B,D3B)
C <- data.frame(Date,D1C,D2C,D3C)
I would appreciate any help here.
We get the datasets in a list (without the first column 'Date'), transpose, then loop over the list with map, bind the 'Date' column in each of those datasets (it is better to keep it as a list, but if needed use list2env to create the objects in the global env)
library(dplyr)
library(purrr)
# 1. Put the data frames (minus their first column, Date) in a named list.
# 2. Transpose it, so the list is organised by variable instead of by frame.
# 3. Prepend the Date column to each per-variable set of columns.
# 4. Create one object per variable (A, B, C) in the global environment.
list(D1 = D1[-1], D2 = D2[-1], D3 = D3[-1]) %>%
  transpose() %>%
  map(function(cols) bind_cols(D1["Date"], cols)) %>%
  list2env(envir = .GlobalEnv)
Check the objects A, B and C that were created:
head(A, 2)
# Date D1 D2 D3
#1 2001-01-01 2.150310 4.852097 3.660461
#2 2001-01-02 4.153221 4.609196 1.379363
head(B, 2)
# Date D1 D2 D3
#1 2001-01-01 5.870500 3.428400 5.263425
#2 2001-01-02 4.360002 4.243639 4.887663
head(C, 2)
# Date D1 D2 D3
#1 2001-01-01 4.668618 2.137494 2.730858
#2 2001-01-02 4.078410 3.326600 4.004167
Here is a base R option
# Collect every object whose name starts with "D" followed by digits.
lst <- mget(ls(pattern = "^D\\d+"))
# For each variable (every column of the first frame except the first one),
# build a data frame of Date plus that variable from every frame, then
# create one object per variable in the global environment.
list2env(
  lapply(
    # A self-named vector makes lapply() return a list named by variable.
    setNames(nm = names(lst[[1]])[-1]),
    function(v) {
      values <- list2DF(lapply(lst, function(d) d[[v]]))
      cbind(lst[[1]]["Date"], values)
    }
  ),
  envir = .GlobalEnv
)
I would like to "copy-paste" the values of one column from data frame A underneath the values of a column in data frame B.
Below I've visualized what I'm trying to achieve:
One option is to use bind_rows on the selected columns after making the column types the same
library(dplyr)
# Turn ColumnA into a character column named ColumnC so its type matches
# df2$ColumnC, then stack it underneath df2; ColumnD is filled with NA.
bind_rows(
  df2,
  transmute(df1[1], ColumnC = as.character(ColumnA))
)
# ColumnC ColumnD
#1 a b
#2 1 <NA>
#3 2 <NA>
#4 3 <NA>
data
# Example inputs: a numeric two-column frame and a one-row character frame.
df1 <- data.frame(
  ColumnA = seq_len(3),
  ColumnB = 4:6
)
df2 <- data.frame(
  ColumnC = "a",
  ColumnD = "b",
  stringsAsFactors = FALSE
)
You can also use base R for this. What you actually want is a full (outer) join of df1 and df2, which merge(..., all = TRUE) performs:
# Build the two inputs with their final column names in one step.
df1 <- setNames(data.frame(1:3, 4:6), c("c1", "c2"))
# The first column of df2 is named "c1" so it can serve as the join key.
df2 <- setNames(data.frame("a", "b"), c("c1", "c4"))
# Keep every row from both sides (all = TRUE), joining on c1.
merge(x = df1["c1"], y = df2, by = "c1", all = TRUE)
I have a list of data.frames (in this example only 2):
# Example inputs: two data frames sharing the key column `id` and the
# non-key column `val`. TRUE/FALSE are spelled out because T and F are
# ordinary variables that can be reassigned.
set.seed(1)
df1 <- data.frame(
  id = sample(LETTERS, 50, replace = TRUE),
  val = rnorm(50),
  val1 = rnorm(50),
  stringsAsFactors = FALSE
)
df2 <- data.frame(
  id = sample(LETTERS, 30, replace = TRUE),
  val = rnorm(30),
  val2 = rnorm(30),
  stringsAsFactors = FALSE
)
df.list <- list(df1, df2)
I want to join them into a single data.frame only by a subset of the shared column names, in this case by id.
If I use:
library(dplyr)
# Fold the list into one data frame by inner-joining successive elements on id.
df <- purrr::reduce(df.list, dplyr::inner_join, by = "id")
The shared column names, which I'm not joining by, get mutated with the x and y suffixes:
id val.x val1 val.y val2
1 G -0.05612874 0.2914462 2.087167 0.7876396
2 G -0.05612874 0.2914462 -0.255027 1.4411577
3 J -0.15579551 -0.4432919 -1.286301 1.0273924
In reality, for the shared column names that I'm not joining by, it's good enough to select them from a single data.frame in the list - whichever one they exist in for the joined id.
I don't know these shared column names in advance, but that's not difficult to find out:
E.g.:
# Gather the non-key column names of every data frame in the list ...
df.list.colnames <- unlist(lapply(df.list, function(d) {
  colnames(dplyr::select(d, -id))
}))
# ... count how often each name occurs ...
df.list.colnames <- table(df.list.colnames)
# ... and keep the names that occur in more than one data frame.
repeating.colnames <- names(df.list.colnames)[df.list.colnames > 1]
Which will then allow me to separate them from the data.frames in the list:
# Stack id plus the repeating columns from every data frame and drop
# duplicate rows. select(all_of(...)) replaces the deprecated select_().
repeating.colnames.df <- unique(
  do.call(
    rbind,
    lapply(df.list, function(d) {
      dplyr::select(d, dplyr::all_of(c("id", repeating.colnames)))
    })
  )
)
I can then drop these columns from each data.frame in the list:
And then join them as above:
# Drop the repeating columns from every data frame in the list.
# lapply() avoids the 1:length() loop, which misbehaves on an empty list.
# -all_of() keeps every column when repeating.colnames is empty, whereas
# the deprecated select_() with empty .dots selects nothing.
df.list <- lapply(df.list, function(d) {
  dplyr::select(d, -dplyr::all_of(repeating.colnames))
})
# Inner-join the trimmed data frames on id.
df <- df.list %>% purrr::reduce(dplyr::inner_join, by = "id")
And now I'm left with adding repeating.colnames.df to that. I don't know of any join in dplyr that won't return all combinations between df and repeating.colnames.df, so it seems that all I can do is apply over each df$id, pick the first match in repeating.colnames.df and join the result with df.
Is there anything less cumbersome for this situation?
If I followed correctly, I think you can handle this by writing a custom function to pass into reduce that identifies the common column names (excluding your joining columns) and excludes those columns from the "second" table in the merge. As reduce works through the list, the function will "accumulate" the unique columns, defaulting to the columns in the "left-most" table.
Something like this:
library(dplyr)
library(purrr)
# Recreate the question's example data. TRUE/FALSE are spelled out because
# T and F are ordinary variables that can be reassigned.
set.seed(1)
df1 <- data.frame(
  id = sample(LETTERS, 50, replace = TRUE),
  val = rnorm(50),
  val1 = rnorm(50),
  stringsAsFactors = FALSE
)
df2 <- data.frame(
  id = sample(LETTERS, 30, replace = TRUE),
  val = rnorm(30),
  val2 = rnorm(30),
  stringsAsFactors = FALSE
)
df.list <- list(df1, df2)
#' Inner-join two data frames, keeping shared non-key columns from `df1` only.
#'
#' Columns present in both inputs (other than the join keys) are dropped
#' from `df2` before joining, so the result carries no `.x`/`.y` suffixes.
#'
#' @param df1 Left data frame; its copy of any shared column is kept.
#' @param df2 Right data frame; shared non-key columns are removed from it.
#' @param by_col Character vector of column names to join by.
#' @return The inner join of `df1` and the trimmed `df2` on `by_col`.
fun <- function(df1, df2, by_col = "id") {
  df1_extra <- setdiff(names(df1), by_col)
  df2_extra <- setdiff(names(df2), by_col)
  dup_cols <- intersect(df1_extra, df2_extra)
  # drop = FALSE keeps df2 a data frame even when only one column survives;
  # without it the subset collapses to a vector and the join fails.
  df2_unique <- df2[, !(names(df2) %in% dup_cols), drop = FALSE]
  dplyr::inner_join(df1, df2_unique, by = by_col)
}
# Fold the list with the custom join, accumulating unique columns left to right.
df_chase <- reduce(df.list, fun, by_col = "id")
Created on 2019-01-15 by the reprex package (v0.2.1)
If I compare df_chase to your final solution, I yield the same answer:
> all.equal(df_chase, df_orig)
[1] TRUE
You can just get rid of the duplicate columns from one of the data frames if you say you don't really care about them and simply use base::merge:
set.seed(1)
df1 <- data.frame(
  id = sample(LETTERS, 50, replace = TRUE),
  val = rnorm(50),
  val1 = rnorm(50),
  stringsAsFactors = FALSE
)
df2 <- data.frame(
  id = sample(LETTERS, 30, replace = TRUE),
  val = rnorm(30),
  val2 = rnorm(30),
  stringsAsFactors = FALSE
)
# Flag the df2 columns whose names also occur in df1 (other than the key).
# Matching by name with %in% works whatever the column order or count;
# comparing names(df1) == names(df2) position by position does not.
duplicates <- names(df2) %in% setdiff(names(df1), "id")
# drop = FALSE keeps df2 a data frame even if only `id` remains.
df2 <- df2[, !duplicates, drop = FALSE]
df12 <- base::merge.data.frame(df1, df2, by = "id")
head(df12)
Ok so here's my problem. I'm trying to scrape a ton of data off of websites. My code looks like this:
library(XML)
library(RCurl)
library(rlist)
library(rvest)
library(dplyr)
# Scrape per-team statistics from teamrankings.com for every date/statistic
# pair and left-join each result onto one wide `team_stats` table.

# Input tables read from local CSV files.
team_performance <- read.csv("C:/Users/Will/Documents/team_performance.csv")
stats_names <- read.csv("C:/Users/Will/Documents/stats_names.csv")
date_vals <- read.csv("C:/Users/Will/Documents/date_vals.csv")
teams_list <- read.csv("C:/Users/Will/Documents/teams_list.csv")
# Keep only the first column of each lookup table, as a plain vector.
date_vals <- date_vals[[1]]
stats_names <- stats_names[[1]]
# Build the base table: one copy of teams_list per year, 2007 through 2017.
team_stats <- NULL
for(i in c(0:10)){
burner <- teams_list
burner$Year <- (2007 + i)
team_stats <- rbind(team_stats, burner)
}
# The first column is used as the team identifier in the joins below.
names(team_stats)[[1]] <- "Team"
percent_complete <- 0
# Outer loop over dates, inner loop over statistic names.
for(x in date_vals){
for(i in stats_names){
# Download the page for this statistic (spaces in its name become hyphens)
# as of date x.
# NOTE(review): the URL literal contains a space in "ncaa- basketball";
# this looks unintended - confirm against the real site path.
mpg_link <- getURL(paste0("https://www.teamrankings.com/ncaa- basketball/stat/",gsub(" ","-",i),"?date=",x),.opts = list(ssl.verifypeer = FALSE) )
# Parse the HTML tables on the page and drop NULL entries from the list.
tables <- readHTMLTable(mpg_link)
tables <- list.clean(tables, fun = is.null, recursive = FALSE)
# Row count of each table; not used again in the visible code.
n.rows <- unlist(lapply(tables, function(t) dim(t)[1]))
temp_data <- data.frame(tables)
# Store the statistic name in a column, then use it as the name of column 3.
temp_data$NULL.Stat <- i
names(temp_data)[3] <- temp_data$NULL.Stat[1]
names(temp_data)[2] <- "Team"
# Drop columns 4 through 8.
temp_data <- temp_data[,-c(4:8)]
# The year is taken from the first four characters of the date value.
temp_data$Year <- as.numeric(substr(as.character(x),1,4))
# NOTE(review): by.x / by.y are arguments of base merge(), not of
# left_join(); left_join() takes `by`. As written the join presumably falls
# back to all common columns, which would explain the
# 'Joining, by = c("Team", "Year", "Points Per Game")' message - confirm.
team_stats <- left_join(team_stats,temp_data[,-c(1,4)], by.x = "Team", by.y = "Year")
# Progress report.
# NOTE(review): 979 is presumably the total number of iterations - confirm.
percent_complete <- percent_complete + (100/979)
print(paste(round(percent_complete,digits=2),"% complete",sep=""))
}
}
After the first year (2017) is done, after the joins are completed, I get a message like this:
Joining, by = c("Team", "Year", "Points Per Game")
instead of getting a message like this:
Joining, by = c("Team", "Year")
Any ideas why this might be happening?
Edit: Ok no longer getting the messages but it still won't switch over the year. Once it starts to scrape 2016, data doesn't show up where the year is 2016.
In the left_join, the syntax should be
# left_join() takes a single `by` argument; a named vector c(x_col = "y_col")
# matches column x_col of the first table to column y_col of the second.
left_join(team_stats,temp_data[,-c(1,4)], by=c(Team = "Year"))
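though joining Team to Year does not make sense for these columns; it simply mirrors the OP's syntax. To join on both keys, as in the expected message, use by = c("Team", "Year").
The by.x and by.y arguments belong to merge (from base R), not to left_join.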
As a reproducible example
# Small reproducible inputs: df1 is keyed by col1, df2 by A (each key twice).
set.seed(24)
df1 <- data.frame(
  col1 = seq_len(5),
  col2 = rnorm(n = 5)
)
df2 <- data.frame(
  A = rep(1:3, each = 2),
  B = rnorm(n = 6)
)
The OP's method gives an error in dplyr_0.7.4
left_join(df2, df1, by.x = 'A', by.y = 'col1')
Error: by required, because the data sources have no common
variables
because the arguments don't match
left_join(df2, df1, by = c(A= "col1"))
# A B col2
#1 1 0.266021979 -0.5458808
#2 1 0.444585270 -0.5458808
#3 2 -0.466495124 0.5365853
#4 2 -0.848370044 0.5365853
#5 3 0.002311942 0.4196231
#6 3 -1.316908124 0.4196231
I have a data frame with a number of columns in a form var1.mean, var2.mean. I would like to strip the suffix ".mean" from all columns that contain it. I tried using rename_all in conjunction with regex in a pipe but could not come up with a correct syntax. Any suggestions?
If you want to use the dplyr package, I'd recommend using the rename_at function.
# Example frame: two ".mean" columns and one ".sd" column.
Dframe <- data.frame(
  var1.mean = rnorm(n = 10),
  var2.mean = rnorm(n = 10),
  var1.sd = runif(n = 10)
)
library(dplyr)
# Strip the ".mean" suffix from every column that ends with it.
# A ~ lambda replaces funs(), which is deprecated in dplyr.
Dframe %>%
  rename_at(
    .vars = vars(ends_with(".mean")),
    .funs = ~ sub("[.]mean$", "", .)
  )
Using new dplyr:
# Escape the dot and anchor at the end so only a literal ".mean" suffix is
# removed. An unescaped "." matches any character, and an unanchored pattern
# also hits ".mean" in the middle of a name. Base sub() avoids needing stringr.
df %>% rename_with(~ sub("\\.mean$", "", .x))
We can use rename_all
# Remove only the literal ".mean" suffix. "." stands for the names passed in
# by rename_all(), so the lambda does not depend on the outer df1. The ".sd"
# columns keep their suffix, so no duplicate names are created.
df1 %>%
  rename_all(~ sub("\\.mean$", "", .)) %>%
  head(2)
# var1 var2 var3 var1.sd var2.sd var3.sd
#1 -0.5458808 -0.09411013 0.5266526 -1.3546636 0.08314367 0.5916817
#2 0.5365853 -0.08554095 -1.0736261 -0.9608088 2.78494703 -0.2883407
NOTE: If the column names are duplicated, it needs to be made unique with make.unique
data
# 25 x 6 frame of standard normal draws; columns are var1..var3 with a
# ".mean" suffix followed by var1..var3 with a ".sd" suffix.
set.seed(24)
df1 <- as.data.frame(
  matrix(
    rnorm(150),
    nrow = 25,
    ncol = 6,
    dimnames = list(
      NULL,
      as.vector(outer(paste0("var", 1:3), c(".mean", ".sd"), paste0))
    )
  )
)
You may use gsub.
# Escape the dot and anchor at the end: the bare pattern '.mean' matches any
# character followed by "mean", anywhere in the name.
colnames(df) <- gsub("\\.mean$", "", colnames(df))
The below works for me
dat <- data.frame(var1.mean = 1, var2.mean = 2)
col_old <- colnames(dat)
# "\\.mean$" removes only a literal ".mean" suffix; the unescaped ".mean"
# would also match e.g. "xmean" in the middle of a name.
col_new <- gsub(pattern = "\\.mean$", replacement = "", x = col_old)
colnames(dat) <- col_new
You can replace this names using stringi package stri_replace_last_regex function like this:
# library() stops with an error if stringi is missing; require() would only
# return FALSE and let the calls below fail later.
library(stringi)
df <- data.frame(1, 2, 3, 4, 5, 6)
# Alternate the ".mean" and ".sd" suffixes across var1..var6.
names(df) <- stri_paste("var", 1:6, c(".mean", ".sd"))
df
## var1.mean var2.sd var3.mean var4.sd var5.mean var6.sd
##1 1 2 3 4 5 6
# Remove the ".mean" suffix only where a name ends with it.
names(df) <- stri_replace_last_regex(names(df), "\\.mean$", "")
df
## var1 var2.sd var3 var4.sd var5 var6.sd
##1 1 2 3 4 5 6
The regex is \\.mean$ because you need to escape the dot character (it has a special meaning in regex), and the $ sign at the end ensures that you only replace names that END with this pattern (if the .mean text is in the middle of the string, it won't be replaced).
I would use strsplit:
x <- as.data.frame(matrix(runif(16), ncol = 4))
colnames(x) <- c("var1.mean", "var2.mean", "var3.mean", "something.else")
# fixed = TRUE treats ".mean" literally instead of as a regex where "."
# matches any character. strsplit() returns a list, so take the first piece
# of each name to get a plain character vector for colnames().
colnames(x) <- vapply(
  strsplit(colnames(x), split = ".mean", fixed = TRUE),
  function(parts) parts[1],
  character(1)
)
colnames(x)
Lots of quick answers have been given; the most intuitive to me would be:
# Create the example frame.
Dframe <- data.frame(
  var1.mean = rnorm(n = 10),
  var2.mean = rnorm(n = 10),
  var1.sd = runif(n = 10)
)
# Remove ".mean" from the column names.
colnames(Dframe) <- gsub(
  pattern = "[.]mean",
  replacement = "",
  x = colnames(Dframe)
)