I have a dataframe with 1000 columns of data
str(MT)
'data.frame': 1356 obs. of 1000 variables:
$ Date : Factor w/ 1356 levels "Apr-1900","Apr-1901",..: 453 340 792 1 905 679 566 114 1244 1131 ...
$ Year : int 1900 1900 1900 1900 1900 1900 1900 1900 1900 1900 ...
$ X1 : num -27.4 -27.8 -17 1.7 7.9 ...
$ X2 : num -27.21 -27.99 -17.05 1.69 7.75 ...
$ X3 : num -26.67 -27.84 -16.75 2.24 7.82 ...
$ X4 : num -26.64 -27.98 -16.83 2.46 7.97 ...
.....
$ X1000 : num -29.13 -30.61 -20.47 -0.46 6.5
I would like to split this dataframe into three columns (Date, Year and Xn) using a loop, so that at the end of it all I will have 1000 separate csv files with 3 columns of data. My code thus far is
for (i in ncol(MT)) {
x[[i]]<-data.frame(MT$Date, Year, MT$[[i]]) }
However, it is giving me errors. Your guidance would be appreciated, as I am new to R.
Your code has some syntax and logic errors:
Your for loop is not looping through a range of values; it "loops" once with i = ncol(MT). It should be (i in 1:ncol(MT));
Actually, you shouldn't loop through all columns, since two of them aren't Xn columns, so (i in 1:(ncol(MT)-2));
It's not clear whether you did, but you should create x before trying to assign data to it, preferably at its final size;
You didn't use MT$ to select the Year column;
You used both $ and [[ to subset the Xn column. Use just [ instead, because that way you can use i and keep the column name.
Fixing all these, with some example data, you get:
MT <- data.frame(Date = rnorm(5), Year = rnorm(5), X1 = rnorm(5), X2 = rnorm(5), X3 = rnorm(5))
nX <- ncol(MT)-2
listofdf <- lapply(1:nX, function(x) NULL)
for (i in 1:nX) {
listofdf[[i]] <- data.frame(MT$Date, MT$Year, MT[i+2])
}
listofdf
# [[1]]
# MT.Date MT.Year X1
# 1 -0.94184053 1.0241134 -0.4329728
# 2 0.59637577 -0.6195477 -1.3011527
# 3 0.33474278 1.0628674 -0.8957239
# 4 -0.04328685 0.4275993 -0.7840214
# 5 0.78799652 0.5707058 -0.4243622
#
# [[2]]
# MT.Date MT.Year X2
# 1 -0.94184053 1.0241134 2.2380838
# 2 0.59637577 -0.6195477 -0.9995170
# 3 0.33474278 1.0628674 0.3452450
# 4 -0.04328685 0.4275993 -1.0453718
# 5 0.78799652 0.5707058 -0.6292885
#
# [[3]]
# MT.Date MT.Year X3
# 1 -0.94184053 1.0241134 -0.05293727
# 2 0.59637577 -0.6195477 0.84947635
# 3 0.33474278 1.0628674 1.17748809
# 4 -0.04328685 0.4275993 1.73233398
# 5 0.78799652 0.5707058 -0.61874653
If you're just going to save them as .csv files, it's not necessary to store them in a list, though. Instead, you can use:
for (i in 1:nX) {
tempdf <- data.frame(MT$Date, MT$Year, MT[i+2])
write.csv(tempdf, paste0("MT_subset_X", i, ".csv"))
}
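A small variation, sketched under the assumption that the Xn columns really are named X1 to X1000 as in your str() output: reuse each column's own name for its file and drop the automatic row numbers:
for (i in 1:nX) {
tempdf <- data.frame(Date = MT$Date, Year = MT$Year, MT[i + 2])
# names(MT)[i + 2] is the Xn column name, e.g. "X1"
write.csv(tempdf, paste0("MT_subset_", names(MT)[i + 2], ".csv"), row.names = FALSE)
}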
Reusing the sample data created by @Molx, and doing some reshaping as @Neal Fultz suggested in the comments, using tidyr:
# generate sample data
MT <- data.frame(Date = rnorm(5), Year = rnorm(5), X1 = rnorm(5), X2 = rnorm(5), X3 = rnorm(5))
Then gather all variables and values, excluding Date and Year, into key-value column pairs:
> require(tidyr)
> MTg <- gather(MT, var, value, -c(Date, Year))
> MTg
Date Year var value
1 -1.5356474 -1.0963886 X1 -0.74075807
2 -1.1346928 0.2925819 X1 1.42787059
3 0.7031032 0.3361561 X1 -0.27112156
4 1.0140557 1.2587298 X1 0.85693377
5 0.2529787 -3.0113663 X1 0.12686607
6 -1.5356474 -1.0963886 X2 0.21406288
7 -1.1346928 0.2925819 X2 -1.11363330
8 0.7031032 0.3361561 X2 -0.30324978
9 1.0140557 1.2587298 X2 0.48954893
10 0.2529787 -3.0113663 X2 0.85898166
11 -1.5356474 -1.0963886 X3 -0.44394680
12 -1.1346928 0.2925819 X3 -0.86942530
13 0.7031032 0.3361561 X3 -1.62344294
14 1.0140557 1.2587298 X3 0.09880026
15 0.2529787 -3.0113663 X3 -0.76091871
Then run through all the variable names, exporting each subset into an individual csv file named after the variable:
varnames <- levels(MTg$var) # get variable names
dummy <- lapply(varnames, function(x)
write.csv(MTg[MTg$var==x,], file=paste0(x, ".csv")))
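An equivalent approach, as a sketch, is to split the long data frame by var and write each piece. Note that newer tidyr versions make the key column character rather than factor, in which case unique(MTg$var) is a safer way to list the names than levels():
pieces <- split(MTg, MTg$var)  # one data frame per Xn
invisible(lapply(names(pieces), function(v)
write.csv(pieces[[v]], file = paste0(v, ".csv"), row.names = FALSE)))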
I have a series of .txt files that look like this:
Button,Intensity,Acc,Intensity,RT,Time
0,30,0,0,0,77987.931
1,30,1,13.5,0,78084.57
1,30,1,15,0,78098.624
1,30,1,6,0,78114.132
1,30,1,15,0,78120.669
They have file names like 1531_Day49.txt, 1531_Day50.txt, 1532_Day49.txt, 1532_Day50.txt etc
I want to load all the files in this directory into data frames, append a column (Tdelta) holding the difference from the Time in the row above, and append two more columns: ID, containing the first 4 digits of the file name (i.e. 1531, 1532), and PrePost, decoding the Day part of the file name, so each row is "Pre" for a Day49 file and "Post" for a Day50 file.
So ideal output for a 1531 Day 49 file would be:
Button,Intensity,Acc,Intensity,RT,Time,Tdelta,ID,PrePost
0,30,0,0,0,77987.931,0,1531,Pre
1,30,1,13.5,0,78084.57,96.639,1531,Pre
1,30,1,15,0,78098.624,14.054,1531,Pre
So far I have:
#call library
library(data.table)
#batch enter .txt files and put them into a data frame
setwd("~/Documents/PVTPASAT/PVT")
temp = list.files(pattern="*.txt")
list.DFs <- lapply(temp, fread)
#view print out to visually check
View(list.DFs)
#add column of time difference
list.DFs <- lapply(list.DFs, cbind, tDelta = c(0, diff(df$Time)))
#Add empty columns for ID and PrePost
list.DFs <- lapply(list.DFs, cbind, ID = c(""))
list.DFs <- lapply(list.DFs, cbind, PrePost = c(""))
#print one to visually check
View(list.DFs[3])
I would create a function to do the processing and then apply it to your list of files like so:
example <- read.delim(textConnection('
Button, Intensity, Acc, Intensity, RT, Time
0,30,0,0,0,77987.931
1,30,1,13.5,0,78084.57
1,30,1,15,0,78098.624
1,30,1,6,0,78114.132
1,30,1,15,0,78120.669'),
header = T,
sep = ','
)
write.table(example, '1531_Day49.txt', row.names = F)
temp <- list.files(pattern="*.txt")
process_txt <- function(x) {
dat <- data.table::fread(x, header = T)
dat$tdelta <- c(0, diff(dat$Time))
dat$ID <- substr(x, 1, 4)
dat$PrePost <- if (grepl('49\\.', x)) {'Pre'} else {'Post'}
dat
}
out <- lapply(temp, process_txt)
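Since fread() returns data.tables, it is also easy to stack the processed list into one table for analysis; a minimal sketch:
library(data.table)
all.dat <- rbindlist(out)  # one table holding every file, with tdelta, ID and PrePost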
@Heather, the main guidance is to first solve one file properly. Then place all that working code into a function.
library(data.table) ## for fread
library(dplyr) ## for the lag function
library(stringr) ## for str_detect
# make two test files
dt <- read.csv(text=
'Button,Intensity,Acc,Intensity,RT,Time
0,30,0,0,0,77987.931
1,30,1,13.5,0,78084.57
1,30,1,15,0,78098.624
1,30,1,6,0,78114.132
1,30,1,15,0,78120.669
')
write.csv(dt,"1531_Day49.txt")
write.csv(dt,"1532_Day50.txt")
# function to do the work for one file name - returns a dataframe
doOne <- function (file) {
# read
contents <- fread(file)
# compute delta
contents$Tdelta <- contents$Time - lag(contents$Time)
# prefix up to underscore
contents$ID <- strsplit(file, c("_"))[[1]][[1]]
# add the PrePost column using ifelse and str_detect
contents$PrePost <- ifelse(str_detect(file, "Day49"), "Pre", "Post")
return(contents)
}
#test files
files <- c("1531_Day49.txt", "1532_Day50.txt")
# call the function for each file -- result is
# a list of dataframes
lapply(files, doOne)
# better get them all into a single data frame for analysis
do.call(rbind, lapply(files, doOne))
# V1 Button Intensity Acc Intensity.1 RT Time Tdelta ID PrePost
# 1: 1 0 30 0 0.0 0 77987.93 NA 1531 Pre
# 2: 2 1 30 1 13.5 0 78084.57 96.639 1531 Pre
# 3: 3 1 30 1 15.0 0 78098.62 14.054 1531 Pre
# 4: 4 1 30 1 6.0 0 78114.13 15.508 1531 Pre
# 5: 5 1 30 1 15.0 0 78120.67 6.537 1531 Pre
# 6: 1 0 30 0 0.0 0 77987.93 NA 1532 Post
# 7: 2 1 30 1 13.5 0 78084.57 96.639 1532 Post
# 8: 3 1 30 1 15.0 0 78098.62 14.054 1532 Post
# 9: 4 1 30 1 6.0 0 78114.13 15.508 1532 Post
# 10: 5 1 30 1 15.0 0 78120.67 6.537 1532 Post
I have a function that I want to use, but the inputs come from a text file.
Here is the Fun:
myfun <- function(latitude, longitude) {
column <- latitude * 5
row <- longitude * 3
return(c(column, row))
}
Now I have a text file with information for my function.
cor=read.table("C:\\Data\\AMS.txt", sep="")
head(cor)
V1 V2 V3 V4 V5 V6
1 lat 13 lon 2 Site: As
2 lat 14 lon 3 Site: Ad
Output needed for instance:
lat lon column row site
13 2 ? ? As
I can do this manually, but as I have many, it would be better to let R do it. Any hints are appreciated.
Try:
data.frame(lat=DF$V2, lon=DF$V4, column=DF$V2*5, row=DF$V4*3, site=DF$V6)
No need for cleaning.
Data
DF <- read.table(text=" V1 V2 V3 V4 V5 V6
1 lat 13 lon 2 Site: As
2 lat 14 lon 3 Site: Ad", header=T)
> DF1 <- data.frame(lat=DF$V2, lon=DF$V4, column=DF$V2*5, row=DF$V4*3, site=DF$V6)
> DF1
lat lon column row site
1 13 2 65 6 As
2 14 3 70 9 Ad
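If you would rather route the values through myfun as written (instead of recomputing column and row inline), a sketch using mapply:
res <- t(mapply(myfun, DF$V2, DF$V4))  # one row per site: (column, row)
DF2 <- data.frame(lat = DF$V2, lon = DF$V4,
column = res[, 1], row = res[, 2], site = DF$V6)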
Unless you need to use a function, I would use a vectorised solution. First, I'd tidy up your data frame:
cor <- read.table("C:\\Data\\AMS.txt", sep="") # note <- not =
require("dplyr")
cor <- select(cor, -V1, -V3, -V5)
colnames(cor) <- c("lat", "long", "site")
Then I'd simply create a new variable for column and row:
cor$column <- cor$lat * 5
cor$row <- cor$long * 3
Yielding:
cor
# lat long site column row
# 1 13 2 As 65 6
# 2 14 3 Ad 70 9
EDIT: Based on your comments and edited post you clearly have a more complex function, which I've attempted to vectorise below. The output is a vector of 5 items for each of column and row, so hopefully that's your expected behaviour.
kR = 6371.228 # recommend constants start with 'k'
kC = 25.067525
kNc = 1383
kNl = 586
kR0 = (kNc - 1) / 2
kS0 = (kNl - 1) / 2
kDeg2rad_cte = pi/180
cor$lamda <- cor$long * kDeg2rad_cte
cor$phi <- cor$lat * kDeg2rad_cte
column <- round(kR0 + (kR / kC) %*% cor$lamda * cos(pi / 6)) + 1
row <- 586 - round(kS0 - (kR / kC) %*% sin(cor$phi) / cos(pi / 6))
column <- seq(min(column), max(column), by=1)
row <- seq(min(row), max(row), by=1)
column
# [1] 700 701 702 703 704
row
# [1] 360 361 362 363 364
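One stylistic note: %*% is matrix multiplication, so the expressions above return 1 x n matrices. Because kR / kC is a scalar, plain elementwise * gives the same numbers while keeping column and row as ordinary vectors; a sketch with the same constants:
column <- round(kR0 + (kR / kC) * cor$lamda * cos(pi / 6)) + 1
row <- 586 - round(kS0 - (kR / kC) * sin(cor$phi) / cos(pi / 6))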
In the process of learning. Didn't ask my first question well, so I'm trying again and doing my best to be more clear.
I'm trying to create a series of data frames for a reproducible question for my larger issue. I would like to make 4 data frames, each named differently by the year. Eventually I will merge these four data frames to explain where I am encountering my issue.
Here is the most recent solution. This runs, but creates a list of four data frames rather than putting any individual frames in the global environment.
datafrom <- list()
years <- c(2006,2008,2010,2012)
for (i in 1:length(years)) {
UniqueID <- 1:10 # <- Not all numeric - Kept as character vector
Name <- LETTERS[seq( from = 1, to = 10 )]
Entity_Type <- factor("This","That")
Data1 <- rnorm(10)
Data2 <- rnorm(10)
Data3 <- rnorm(10)
Data4 <- rnorm(10)
Year <- years[i]
datafrom[[i]] <- data.frame(UniqueID, Name, Entity_Type, Data1, Data2, Data3, Data4, Year)
}
I would like 4 separate data frames, each named datafrom2006, datafrom2008, etc.
Many thanks in advance for your patience with my learning.
I'll demonstrate a few (of many) techniques here, and I'll call them (1) brute force, (2) list-based, and (3) single long-form data.frame.
I'll add to the example the use of a function that you want to apply to each data.frame. Though contrived, it helps make the point:
## some constants used throughout
years <- c(2006, 2008, 2010, 2012)
n <- 10
myfunc <- function(x) {
interestingPart <- x[ , grepl('^Data', colnames(x)) ]
sapply(interestingPart, mean)
}
Brute Force
Yes, you can create multiple like-named and same-structure data.frames from a loop, though it is typically frowned upon by many experienced (R?) programmers:
set.seed(42)
for (yr in years) {
tmpdf <- data.frame(UniqueID=as.character(1:n),
Name=LETTERS[1:n],
Entity_Type=factor(c('this', 'that')),
Data1=rnorm(n),
Data2=rnorm(n),
Data3=rnorm(n),
Data4=rnorm(n),
Year=yr)
assign(sprintf('datafrom%s', yr), tmpdf)
}
rm(yr, tmpdf)
ls()
## [1] "datafrom2006" "datafrom2008" "datafrom2010" "datafrom2012" "myfunc"
## [6] "n" "years"
head(datafrom2006, n=2)
## UniqueID Name Entity_Type Data1 Data2 Data3 Data4 Year
## 1 1 A this 1.3709584 1.3048697 -0.3066386 0.4554501 2006
## 2 2 B that -0.5646982 2.2866454 -1.7813084 0.7048373 2006
In order to see the results for each data.frame, one would typically (though not always) do something like this:
myfunc(datafrom2006)
## Data1 Data2 Data3 Data4
## 0.5472968 -0.1634567 -0.1780795 -0.3639041
myfunc(datafrom2008)
## Data1 Data2 Data3 Data4
## -0.02021535 0.01839391 0.53907680 -0.21787537
myfunc(datafrom2010)
## Data1 Data2 Data3 Data4
## 0.25110630 -0.08719458 0.22924781 -0.19857243
myfunc(datafrom2012)
## Data1 Data2 Data3 Data4
## -0.7949660 0.2102418 -0.2022066 -0.2458678
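If you do end up with like-named frames in the global environment, you can still collect them programmatically rather than typing each name; a sketch using mget:
bruteforce <- mget(ls(pattern = "^datafrom[0-9]+$"))  # gather the frames into a list
lapply(bruteforce, myfunc)  # same results as the four calls above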
List-Based
set.seed(42)
datafrom <- sapply(as.character(years), function(yr) {
data.frame(UniqueID=as.character(1:n),
Name=LETTERS[1:n],
Entity_Type=factor(c('this', 'that')),
Data1=rnorm(n),
Data2=rnorm(n),
Data3=rnorm(n),
Data4=rnorm(n),
Year=yr)
}, simplify=FALSE)
str(datafrom)
## List of 4
## $ 2006:'data.frame': 10 obs. of 8 variables:
## ..$ UniqueID : Factor w/ 10 levels "1","10","2","3",..: 1 3 4 5 6 7 8 9 10 2
## ..$ Name : Factor w/ 10 levels "A","B","C","D",..: 1 2 3 4 5 6 7 8 9 10
## ..$ Entity_Type: Factor w/ 2 levels "that","this": 2 1 2 1 2 1 2 1 2 1
## ..$ Data1 : num [1:10] 1.371 -0.565 0.363 0.633 0.404 ...
## ..$ Data2 : num [1:10] 1.305 2.287 -1.389 -0.279 -0.133 ...
## ..$ Data3 : num [1:10] -0.307 -1.781 -0.172 1.215 1.895 ...
## ..$ Data4 : num [1:10] 0.455 0.705 1.035 -0.609 0.505 ...
## ..$ Year : Factor w/ 1 level "2006": 1 1 1 1 1 1 1 1 1 1
## $ 2008:'data.frame': 10 obs. of 8 variables:
## ..$ UniqueID : Factor w/ 10 levels "1","10","2","3",..: 1 3 4 5 6 7 8 9 10 2
#### ...snip...
head(datafrom[[1]], n=2)
## UniqueID Name Entity_Type Data1 Data2 Data3 Data4 Year
## 1 1 A this 1.3709584 1.3048697 -0.3066386 0.4554501 2006
## 2 2 B that -0.5646982 2.2866454 -1.7813084 0.7048373 2006
head(datafrom[['2008']], n=2)
## UniqueID Name Entity_Type Data1 Data2 Data3 Data4 Year
## 1 1 A this 0.2059986 0.32192527 -0.3672346 -1.04311894 2008
## 2 2 B that -0.3610573 -0.78383894 0.1852306 -0.09018639 2008
However, with this you can test your function performance with just one:
myfunc(datafrom[[1]])
myfunc(datafrom[['2010']])
and then run the function on all of them very simply:
lapply(datafrom, myfunc)
## $`2006`
## Data1 Data2 Data3 Data4
## 0.5472968 -0.1634567 -0.1780795 -0.3639041
## $`2008`
## Data1 Data2 Data3 Data4
## -0.02021535 0.01839391 0.53907680 -0.21787537
## $`2010`
## Data1 Data2 Data3 Data4
## 0.25110630 -0.08719458 0.22924781 -0.19857243
## $`2012`
## Data1 Data2 Data3 Data4
## -0.7949660 0.2102418 -0.2022066 -0.2458678
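Because every element of the list yields a result of the same shape, sapply can collapse these summaries further into a single matrix, one column per year; for instance:
sapply(datafrom, myfunc)  # 4 x 4 matrix: rows Data1..Data4, one column per year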
Long-form Data
If instead you keep all of the data in the same data.frame, using your already-defined column of Year, you can still segment it for exploring individual years:
longdf <- do.call('rbind.data.frame', datafrom)
rownames(longdf) <- NULL
longdf[c(1,11,21,31),]
## UniqueID Name Entity_Type Data1 Data2 Data3 Data4 Year
## 1 1 A this 1.3709584 1.3048697 -0.3066386 0.45545012 2006
## 11 1 A this 0.2059986 0.3219253 -0.3672346 -1.04311894 2008
## 21 1 A this 1.5127070 1.3921164 1.2009654 -0.02509255 2010
## 31 1 A this -1.4936251 0.5676206 -0.0861073 -0.04069848 2012
Simple subsets:
subset(longdf, Year == 2006), though subset has its strengths and its caveats.
by(longdf, longdf$Year, myfunc)
If using library(dplyr), try longdf %>% filter(Year == 2010) %>% myfunc()
(Side note: when trying to plot aggregate data, it's often easier when the data is in this form, especially when using ggplot2-like layering and aesthetics.)
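For example, base aggregate() computes the per-year means directly from the long form, with no splitting at all (a sketch):
aggregate(cbind(Data1, Data2, Data3, Data4) ~ Year, data = longdf, FUN = mean)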
Rationale Against "Brute Force"
In answer to your comment question, when making different variables with the same structure, it is easy to deduce that you will be doing the same thing to each of them, in turn or immediately consecutively. As a general programming principle, many try to generalize what they do so that if it can be done once, it can be done an arbitrary number of times without (heavily) adjusting the code. For instance, compare what was necessary in applying myfunc in the two examples above.
Further, if you later want to aggregate the results from your calls to myfunc, it is more laborious in the "brute force" example (as you must capture each return and combine manually), whereas the other two techniques can use simpler summarizing functions (e.g., another lapply, or perhaps Reduce or Filter).
I'm trying to join two datasets together. Call them x and y. I believe that the ID variables in y are a subset of the ID variables in x. But not in the pure sense because I know that x contains more IDs than y but I don't know the mapping. That is, some (but not all) of the IDs in x and y can be matched 1:1.
My ultimate goal is to figure out where this 1:1 mapping fails and flag these observations. I thought merge would be the way to go but maybe not. An example is below:
id <- c(1:10, 1:100)
X1 <- rnorm(110, mean = 0, sd = 1)
year <- c("2004","2005","2006","2001","2002")
year <- rep(year, 22)
month = c("Jul","Aug","Sep","Oct","Nov","Dec","Jan","Feb","Mar","Apr")
month <- rep(month, 11)
#dataset X
x <- cbind(id, X1, month, year)
#dataset Y
id2 <- c(1:10, 200)
Y1 <- rnorm(11, mean = 0 , sd = 1)
y <- cbind(id2,Y1)
#merge on the IDs; but we get an error because when id2 == 200 in y we don't
#have a match in x
result <- merge(x, y, by.x="id", by.y = "id2", all =TRUE)
The merge threw an error because id2 == 200 had no match in the x dataset. Unfortunately, I lost the ID and all the information as well! (it should equal 200 in row 111):
tail(result)
id X1 month year Y1
106 95 -0.0748386054887876 Nov 2002 NA
107 96 0.196765325477989 Dec 2004 NA
108 97 0.527922135906927 Jan 2005 NA
109 98 0.197927230533413 Feb 2006 NA
110 99 -0.00720474886698309 Mar 2001 NA
111 <NA> <NA> <NA> <NA> -0.9664941
What's more, I get duplicate observations on the ID variable in the merged file. The id2 == 1 observation only existed once but it just copied it twice (e.g. Y1 takes on the value 1.55 twice).
head(result)
id X1 month year Y1
1 1 -0.67371266313441 Jul 2004 1.553220
2 1 -0.318666983469993 Jul 2004 1.553220
3 10 -0.608192898092431 Apr 2002 1.234325
4 10 -0.72299929212347 Apr 2002 1.234325
5 100 -0.842111221826554 Apr 2002 NA
6 11 -0.16316681842082 Jul 2004 NA
This merge has made things more complicated than I intended. I was hoping I could examine every observation in x and figure out where the id matched id2 in y and flag the ones that didn't. So I would get a new vector, call it flag, that takes on a value 1 if x$id had a match in y$id2 and zero otherwise. This way, I could know where the 1:1 mapping failed. I could potentially get some traction on this by re-coding the NAs, but what about the error that gets thrown when id2 == 200? It just discards the information.
I have tried appending by rows with no luck, and it looks like I should give up on merge as well; perhaps it's better to write a loop or function to do something along these lines:
for every observation in x
id2 = which(id2) corresponds to id-month-year
flag = 1 if length of above is == 1, 0 otherwise
etc.
Hopefully this all makes sense. I'd be very grateful for any help or guidance.
If you are looking for which things in x$id are in y$id2, then you can use
x$id %in% y$id2
to get a logical vector returning matches. It does not guarantee a 1-to-1 correspondence, however; just a 1-to-many. You can then add this vector to your data frame
x$match.y <- x$id %in% y$id2
to see what rows of x have a corresponding ID in y.
To see which observations are 1-to-1, you could do something like
y$id2[duplicated(y$id2)] #vector of duplicate elements in y$id2
(x$id %in% y$id2) & !(x$id %in% y$id2[duplicated(y$id2)])
to filter out elements that appear more than once in y$id2. You can also add this to x:
x$match.y.unique <- (x$id %in% y$id2) & !(x$id %in% y$id2[duplicated(y$id2)])
The same procedure can be done for y to determine what rows of y match in x, and which ones match uniquely.
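For example, the mirror-image flags for y would be (a sketch following the same pattern):
y$match.x <- y$id2 %in% x$id
y$match.x.unique <- (y$id2 %in% x$id) & !(y$id2 %in% x$id[duplicated(x$id)])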
The reason your merge failed was that you gave it two different structures (one a numeric matrix and the other a character matrix) for x and y. Using cbind when data.frame should be chosen is a common strategy for failure.
> str(x)
chr [1:110, 1:4] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "1" "2" ...
- attr(*, "dimnames")=List of 2
..$ : NULL
..$ : chr [1:4] "id" "X1" "month" "year"
> str(y)
num [1:11, 1:2] 1 2 3 4 5 6 7 8 9 10 ...
- attr(*, "dimnames")=List of 2
..$ : NULL
..$ : chr [1:2] "id2" "Y1"
If you used the data.frame function (since dataframes are what merge is supposed to be working with) it would have succeeded:
> x <- data.frame(id, X1, month, year); y <- data.frame(id2,Y1)
> str( result <- merge(x, y, by.x="id", by.y = "id2", all =TRUE) )
'data.frame': 111 obs. of 5 variables:
$ id : num 1 1 2 2 3 3 4 4 5 5 ...
$ X1 : num 1.5063 2.5035 0.7889 -0.4907 -0.0446 ...
$ month: Factor w/ 10 levels "Apr","Aug","Dec",..: 6 6 2 2 10 10 9 9 8 8 ...
$ year : Factor w/ 5 levels "2001","2002",..: 3 3 4 4 5 5 1 1 2 2 ...
$ Y1 : num 1.449 1.449 -0.134 -0.134 -0.828 ...
> tail( result <- merge(x, y, by.x="id", by.y = "id2", all =TRUE) )
id X1 month year Y1
106 96 -0.3869157 Dec 2004 NA
107 97 0.6373009 Jan 2005 NA
108 98 -0.7735626 Feb 2006 NA
109 99 -1.3537915 Mar 2001 NA
110 100 0.2626190 Apr 2002 NA
111 200 NA <NA> <NA> -1.509818
If you have duplicates in your 'x' argument, then you should get duplicates in the result. It's then your responsibility to use !duplicated in whatever manner you deem appropriate (either before or after the merge), but you cannot expect merge to be making decisions like that for you.
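As a sketch of that last point, assuming the data.frame versions of x and y built above and that you want to keep only the first row per id before merging:
x.unique <- x[!duplicated(x$id), ]  # keep the first occurrence of each id
str( merge(x.unique, y, by.x = "id", by.y = "id2", all = TRUE) )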
This is a follow-up question related to my previous post. Below is a more explanatory version of "what I want to do" as opposed to "how do I make this method work".
Below is code that produces a "master" database, from which, I extract elements for further use in other functions. I routinely extract elements of data based on the value of a group identification number.
Objective: I would like to be able to "wrap" the specifications that vary (like the name of the output dataframe and the groups selected) into a function that could be called.
##### generating data for example
set.seed(271828)
n.elements <- c(10,10,12,14,16,18)
group.number <- rep(1001:1006, n.elements)
element.id <- c(
seq(1,n.elements[1], 1),
seq(1,n.elements[2], 1),
seq(1,n.elements[3], 1),
seq(1,n.elements[4], 1),
seq(1,n.elements[5], 1),
seq(1,n.elements[6], 1) )
x1 <- round(rnorm(length(group.number),45, 12), digits=0)
x2 <- round(rbeta(length(group.number),2,4), digits = 2)
data.base <- as.data.frame(cbind(group.number, element.id, x1, x2))
data.base
##### data.base is representative of the large database
##### suppose I need to pull a set together made up of groups:
##### 1003, 1004, and 1001
groups.set.1 <- as.data.frame(c(1003, 1004, 1001))
bank.names <- c("group.number")
colnames(groups.set.1) <- bank.names
set.sort <- matrix(seq(1,nrow(groups.set.1),1))
sort.set.1 <- cbind(groups.set.1, set.sort)
set.1 <- as.data.frame(merge(sort.set.1, data.base,
by="group.number", all.x=TRUE))
##### this is how the dataset needs to be ordered for further use
set.1 <- set.1[order(set.1$set.sort, set.1$element.id ), ]
row.names(set.1) <- seq(nrow(set.1))
EDIT: Suppose I wanted to carry out the same task to produce set.2, where set.2 is made up of groups: 1005, 1006, and 1002. I could just copy the above code, and make the relevant changes. However, I would like to know if it is possible to specify a function so that I can pass the necessary changes to it, and have it produce the output dataframe as desired. Perhaps having a function called group.extract, where I could specify something like the following:
groups.2 <- c(1005, 1006, 1002)
group.extract(set.2, groups.2)
Based on the comments provided, it seems like a list is the way to go, having the function take a list whose elements can vary.
I'd write this function using match, as follows. Here I've hard-coded the names of the columns of the input data frame to use for matching and sorting; those could also be added as optional inputs. The column order of the output is slightly different from yours but that could be easily changed as well.
getset <- function(g, d=data.base) {
d$set.sort <- match(d$group.number, g)
d <- d[!is.na(d$set.sort),]
d <- d[order(d$set.sort, d$element.id),]
rownames(d) <- NULL
d
}
You'd use it almost exactly like you propose:
> set.1 <- getset(c(1003, 1004, 1001))
> head(set.1)
group.number element.id x1 x2 set.sort
1 1003 1 60 0.32 1
2 1003 2 28 0.18 1
3 1003 3 42 0.47 1
4 1003 4 43 0.08 1
5 1003 5 45 0.31 1
6 1003 6 27 0.48 1
Though if you have multiple groups to get, putting them in a list and using lapply would be the way to go.
> groups <- list(group1=c(1003, 1004, 1001), group2=c(1005,1006,1002))
> sets <- lapply(groups, getset)
> lapply(sets, head)
$group1
group.number element.id x1 x2 set.sort
1 1003 1 60 0.32 1
2 1003 2 28 0.18 1
3 1003 3 42 0.47 1
4 1003 4 43 0.08 1
5 1003 5 45 0.31 1
6 1003 6 27 0.48 1
$group2
group.number element.id x1 x2 set.sort
1 1005 1 27 0.20 1
2 1005 2 51 0.48 1
3 1005 3 49 0.43 1
4 1005 4 48 0.20 1
5 1005 5 33 0.37 1
6 1005 6 41 0.50 1
Hopefully reviewing the code in SO can effect a cure for what appears to be a moderately severe case of post-SAS-ism. I think this is a more R-ish way of doing this:
pick <- subset(data.base, group.number %in% c(1003, 1004, 1001) )
idx <- match(pick$group.number, c(1003, 1004, 1001) )
pick[ order(idx, pick$element.id), ]
#---------
group.number element.id x1 x2
21 1003 1 60 0.32
22 1003 2 28 0.18
23 1003 3 42 0.47
24 1003 4 43 0.08
25 1003 5 45 0.31
26 1003 6 27 0.48
snipped----
And this would be that strategy incorporated into a function:
grp.ext.srt <- function(dfrm, grpid) {
pick <- dfrm[dfrm$group.number %in% grpid, ]
idx <- match(pick$group.number, grpid)
rownames(pick) <- NULL
return(pick[order(idx, pick$element.id), ])
}
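Usage would then mirror the earlier examples, e.g.:
set.2 <- grp.ext.srt(data.base, c(1005, 1006, 1002))
head(set.2)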