Difficulty combining lists, characters, and numbers into data frame - r
I'm lost on how to combine my data into a usable data frame. I have a list of lists of character and number vectors Here is a working example of my code so far:
remove(list=ls())
# Headers for each of my column names
headers <- c("name","p","c","prophylaxis","control","inclusion","exclusion","conversion excluded","infection criteria","age criteria","mean age","age sd")
#_name = author and year
#_p = no. in experimental arm.
#_c = no. in control arm
#_abx = antibiotic used
#_con = control used
#_inc = inclusion criteria
#_exc = exclusion criteria
#_coexc = was conversion to open excluded?
#_infxn = infection criteria
#_agecrit = age criteria
#_agemean = mean age of study
#_agesd = sd age of study
# Passos 2016
passos_name <- c("Passos","2016")
passos_p <- 50
passos_c <- 50
passos_abx <- "cefazolin 1g at induction"
passos_con <- "none"
passos_inc <- c("elective LC","symptomatic cholelithiasis","low risk")
passos_exc <- c("renal impairment","hepatic impairment","immunosuppression","regular steroid use","antibiotics within 48H","acute cholecystitis","choledocolithiasis")
passos_coexc <- TRUE
passos_infxn <- c("temperature >37.8C","tachycardia","asthenia","local pain","local purulence")
passos_agecrit <- NULL
passos_agemean <- 48
passos_agesd <- 13.63
passos <- list(passos_name,passos_p,passos_c,passos_abx,passos_con,passos_inc,passos_exc,passos_coexc,passos_infxn,passos_agecrit,passos_agemean,passos_agesd)
names(passos) <- headers
# Darzi 2016
darzi_name <- c("Darzi","2016")
darzi_p <- 182
darzi_c <- 247
darzi_abx <- c("cefazolin 1g 30min prior to induction","cefazolin 1g 6H after induction","cefazolin 1g 12H after induction")
darzi_con <- "NaCl"
darzi_inc <- c("elective LC","first time abdominal surgery")
darzi_exc <- c("antibiotics within 7 days","immunosuppression","acute cholecystitis","choledocolithiasis","cholangitis","obstructive jaundice",
"pancreatitis","previous biliary tract surgery","previous ERCP","DM","massive intraoperative bleeding","antibiotic allergy","major thalassemia",
"empyema")
darzi_coexc <- TRUE
darzi_infxn <- c("temperature >38C","local purulence","intra-abdominal collection")
darzi_agecrit <- c(">18", "<75")
darzi_agemean <- 43.75
darzi_agesd <- 13.30
darzi <- list(darzi_name,darzi_p,darzi_c,darzi_abx,darzi_con,darzi_inc,darzi_exc,darzi_coexc,darzi_infxn,darzi_agecrit,darzi_agemean,darzi_agesd)
names(darzi) <- headers
# Matsui 2014
matsui_name <- c("Matsui","2014")
matsui_p <- 504
matsui_c <- 505
matsui_abx <- c("cefazolin 1g at induction","cefazolin 1g 12H after induction","cefazolin 1g 24H after induction")
matsui_con <- "none"
matsui_inc <- "elective LC"
matsui_exc <- c("emergent","concurrent surgery","regular insulin use","regular steroid use","antibiotic allergy","HD","antibiotics within 7 days","hepatic impairment","chemotherapy")
matsui_coexc <- FALSE
matsui_infxn <- c("local purulence","intra-abdominal collection","distant infection","temperature >38C")
matsui_agecrit <- ">18"
matsui_agemean <- NULL
matsui_agesd <- NULL
matsui <- list(matsui_name,matsui_p,matsui_c,matsui_abx,matsui_con,matsui_inc,matsui_exc,matsui_coexc,matsui_infxn,matsui_agecrit,matsui_agemean,matsui_agesd)
names(matsui) <- headers
# Find unique exclusion critieria in order to create the list of all possible levels
exc <- ls()[grepl("_exc",ls())]
exclist <- sapply(exc,get)
exc.levels <- unique(unlist(exclist,use.names = F))
# Find unique inclusion critieria in order to create the list of all possible levels
inc <- ls()[grepl("_inc",ls())]
inclist <- sapply(inc,get)
inc.levels <- unique(unlist(inclist,use.names = F))
# Find unique antibiotics order to create the list of all possible levels
abx <- ls()[grepl("_abx",ls())]
abxlist <- sapply(abx,get)
abx.levels <- unique(unlist(abxlist,use.names = F))
# Find unique controls in order to create the list of all possible levels
con <- ls()[grepl("_con",ls())]
conlist <- sapply(con,get)
con.levels <- unique(unlist(conlist,use.names = F))
# Find unique age critieria in order to create the list of all possible levels
agecrit <- ls()[grepl("_agecrit",ls())]
agecritlist <- sapply(agecrit,get)
agecrit.levels <- unique(unlist(agecritlist,use.names = F))
I have been struggling with:
1) Turn each of the _exc, _inc, _abx, _con, _agecrit lists into factors using the levels generated at the end of the code block. I have been trying to use a for loop such as:
for (x in exc) {
as.name(x) <- factor(get(x),levels = exc.levels)
}
This only creates a variable, x, that stores the last parsed list as a factor.
2) Combine all of my data into a data frame formatted as such:
name, p, c, prophylaxis, control, inclusion, exclusion, conversion excluded, infection criteria, age criteria, mean age, age sd
"Passos 2016", 50, 50, "cefazolin 1g at induction", "none", ["elective LC","symptomatic cholelithiasis","low risk"], ["renal impairment","hepatic impairment","immunosuppression","regular steroid use","antibiotics within 48H","acute cholecystitis","choledocolithiasis"], TRUE, ["temperature >37.8C","tachycardia","asthenia","local pain","local purulence"], NULL, 48, 13.63
...
# [] = factors
# columns correspond to each studies variables (i.e. passos_name, passos_p, passos_c, etc..)
# rows correspond to each study (i.e., passos, darzi, matsui)
I have tried various solutions on StackOverflow, but have not found any that work; for example:
studies <- list(passos,darzi,matsui,ruangsin,turk,naqvi,hassan,sharma,uludag,yildiz,kuthe,koc,maha,tocchi,higgins,mahmoud,kumar)
library(data.table)
rbindlist(lapply(studies,as.data.frame.list))
I suspect my data may not be exactly amenable to a data frame? Primarily because of trying to store a list of factors in a column. Is that allowed? If not, how is this type of data normally stored? My goal is to be able to meaningfully compare these various criterion across studies.
This is too long for a comment, so I turn it into an "answer":
To start with, have a look at what happens here:
data.frame(name = "Passos, 2016", p = 50)
name p
1 Passos, 2016 50
data.frame(name = c("Passos", "2016"), p = 50)
name p
1 Passos 50
2 2016 50
In the first one, we created a dataframe with the column "name" which contained one entry "Passos, 2016", i.e. one character containing both pieces of information, and the column "p". All fine. Now, in the second version, I specified the column "name" as you did above, using c(Passos, 2016). This is a two-element vector, and hence we get two rows in the dataframe: one with name Passos, one with name 2016, and the column p gets recycled.
Clearly, the latter is probably not what you intended. But it works anyway because R just recycles the shorter vector. Now, what do you think happens if I add a vector that contains three elements?
And this highlights the main issue with what you are doing: you are trying to get a dataframe from many vectors with different lengths. Now, in some cases this is fine if you want the shorter vector to be repeated (in R speech, we call this "recycled"), but it does not look like something you want to do here.
So, my recommendation would be this: try to imagine a matrix and make sure you understand what each element (row and column) is supposed to be. Then specify your data accordingly. If in doubt, look up "tidy data".
Related
Web Scrape titles in r
I am trying to make a function get_CIDname() Each chemical compound has a designated CID, Compound ID, from PubChem's chemical database. For example, Acetic Acid is 176, and water is 962 I have a dataframe with a column of these CIDs, and some other character value columns. I would like to mutate a new column that names each CID as the column's title name from the site. Example: i.e. all instances of 962 in this identifier column is replaced with 'Water', and all instances of 176 is replaced with 'Acetic Acid', the main name on the website https://pubchem.ncbi.nlm.nih.gov/compound/CID example dataset: df <- data.frame("Compound" = c(176,29096,6341,8914,5366204,98464,11572,9231,535144,15669393,1738127,1738124), "Value" = rnorm(12, mean = 500000, sd = 600000)) desired output: df <- data.frame("Compound" = c(176,29096,6341,8914,5366204,98464,11572,9231,535144,15669393,1738127,1738124), "Value" = rnorm(12, mean = 500000, sd = 600000), Match = c("Acetic Acid", "Dihydromyrcenol", etc....)) Currently, I have: get_CIDname <- function(CID){ read_html(paste0("https://pubchem.ncbi.nlm.nih.gov/compound/", CID)) } but do not know how to decipher the HTML of the PubChem's website. What comes next? What is this type of syntax/programming called?
We can use their PUG REST API to extract the JSON datafiles and link the CID to the compound title. #libraries library(jsonlite) library(data.table) #data df <- data.frame("Compound" = c(10413, 176,29096,6341,8914,5366204,98464,11572,9231,535144,15669393,1738127,1738124), "Value" = rnorm(13, mean = 500000, sd = 600000)) #set to data.table df <- as.data.table(df) #set up progressbar pb <- txtProgressBar(min = 0, max = nrow(df), style = 3) #loop through df rows for(i in 1:nrow(df)){ #update progressbar setTxtProgressBar(pb, i) #extract compound data data <- fromJSON(readLines(paste0("https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/", df[i,]$Compound, "/JSON/?response_type=save&response_basename=compound_CID_", df[i,]$Compound))) #extract title compound_title <- data$Record$RecordTitle #add to df df[i, name := compound_title] } head(df) Compound Value name 1: 10413 898404.7 4-Hydroxybutanoic acid 2: 176 174150.1 Acetic Acid 3: 29096 516514.0 Dihydromyrcenol 4: 6341 499010.7 Ethylamine 5: 8914 783220.9 Nonan-1-ol 6: 5366204 217092.8 (Z)-1-Methoxy-2-buten If you have duplicates of Compound in your dataset it might be faster to loop through unique compounds, i.e. for(i in unique(df$compounds) and adjust the code accordingly. Edit: They note in the description of the PUG REST API that PUG REST is not designed for very large volumes (millions) of requests. They ask that any script or application does not make more than 5 requests per second, in order to avoid overloading the PubChem servers. See https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest Something to keep in mind.
Looping row numbers from one dataframe to create new data using logical operations in R
I would like to extract a dataframe that shows how many years it takes for NInd variable (dataset p1) to recover due to some culling happening, which is showed in dataframe e1. I have the following datasets (mine are much bigger, but just to give you something to play with): # Dataset 1 Batch <- c(2,2,2,2,2,2,2,2,2,2) Rep <- c(0,0,0,0,0,0,0,0,0,0) Year <- c(0,0,1,1,2,2,3,3,4,4) RepSeason <- c(0,0,0,0,0,0,0,0,0,0) PatchID <- c(17,25,19,16,21,24,23,20,18,33) Species <- c(0,0,0,0,0,0,0,0,0,0) Selected <- c(1,1,1,1,1,1,1,1,1,1) Nculled <- c(811,4068,1755,449,1195,1711,619,4332,457,5883) e1 <- data.frame(Batch,Rep,Year,RepSeason,PatchID,Species,Selected,Nculled) # Dataset 2 Batch <- c(2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2) Rep <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0) Year <- c(0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2) RepSeason <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0) PatchID <- c(17,25,19,16,21,24,23,20,18,33,17,25,19,16,21,24,23,20,18,33,17,25,19,16,21,24,23,20,18,33) Ncells <- c(6,5,6,4,4,5,6,5,5,5,6,5,6,4,4,5,6,7,3,5,4,4,3,3,4,4,5,5,6,4) Species <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0) NInd <- c(656,656,262,350,175,218,919,218,984,875,700,190,93,127,52,54,292,12,43,68,308,1000,98,29,656,656,262,350,175,300) p1 <- data.frame(Batch, Rep, Year, RepSeason, PatchID, Ncells, Species, NInd) The dataset called e1 shows only those year where some culled happened to the population on specific PatchID. I have created the following script that basically use each row from e1 to create a Recovery number. Maybe there is an easier way to get to the end, but this is the one I managed to get... When you run this, you are working on ONE row of e1, so we focus on the first PatchID encounter and then do some calculation to match that up with p1, and finally I get a number named Recovery. Now, the thing is my dataframe has 50,000 rows, so doing this over and over looks quite tedious. So, that's where I thought a loop may be useful. But have tried and no luck on how to make it work at all... #here is where I would like the loop e2 <- e1[1,] # Trial for one row only # but the idea is having here a loop that keep doing of comes next for each row e3 <- e2 %>% select(1,2,4,5) p2 <- p1[,c(1,2,4,5,3,6,7,8)] # Re-order row2 <- which(apply(p2, 1, function(x) return(all(x == e3)))) p3 <- p1 %>% slice(row2) # all years with that particular patch in that particular Batch #How many times was this patch cull during this replicate? e4 <- e2[,c(1,2,4,5,3,6,7,8)] e4 <- e4 %>% select(1,2,3,4) c_batch <- e1[,c(1,2,4,5,3,6,7,8)] row <- which(apply(c_batch, 1, function(x) return(all(x == e4)))) c4 <- c_batch %>% slice(row) # Number of year to recover to 95% that had before culled c5 <- c4[1,] # extract the first time was culled c5 <- c5 %>% select(1:5) row3 <- which(apply(p2, 1, function(x) return(all(x == c5)))) Before <- p2 %>% slice(row3) NInd <- Before[,8] # Before culling number of individuals Year2 <- Before[,5] # Year number where first culling happened (that actually the number corresponds to individuals before culling, as the Pop file is developed during reproduction, while Cull file is developed after!) Percent <- (95*NInd)/100 # 95% recovery we want to achieve would correspond to having 95% of NInd BEFORE culled happened (Year2) After <- p3 %>% filter(NInd >= Percent & Year > Year2) # Look rows that match number of ind and Year After2 <- After[1,] # we just want the first year where the recovery was successfully achieved Recovery <- After2$Year - Before$Year # no. of years to reach 95% of the population immediately before the cull I reckon that the end would have to change somehow to to tell R that we are creating a dataframe with the Recovery, something like: Batch <- c(1,1,2,2) Rep <- c(0,0,0,0) PatchID <- c(17,25,30,12) Recovery <- c(1,2,1,5) Final <- data.frame(Batch, Rep, PatchID, Recovery) Would that be possible? OR this is just too mess-up and I may should try a different way?
Does the following solve the problem correectly? I have first added a unique ID to your data.frames to allow matching of the cull and population files (this saves most of you complicated look-up code): # Add a unique ID for the patch/replicate etc. (as done in the example code) e1$RepID = paste(e1$Batch, e1$Rep, e1$RepSeason, e1$PatchID, sep = ":") p1$RepID = paste(p1$Batch, p1$Rep, p1$RepSeason, p1$PatchID, sep = ":") If you want a quick overview of the number of times each patch was culled, the new RepID makes this easy: # How many times was each patch culled? table(p1$RepID) Then you want a loop to check the recovery time after each cull. My solutions uses an sapply loop (which also retains the RepIDs so you can match to other metadata later): sapply(unique(e1$RepID), function(rep_id){ all_cull_events = e1[e1$RepID == rep_id, , drop = F] first_year = order(all_cull_events$Year)[1] # The first cull year (assuming data might not be in temporal order) first_cull_event = all_cull_events[first_year, ] # The row corresponding to the first cull event population_counts = p1[p1$RepID == first_cull_event$RepID, ] # The population counts for this plot/replicate population_counts = population_counts[order(population_counts$Year), ] # Order by year (assuming data might not be in temporal order) pop_at_first_cull_event = population_counts[population_counts$Year == first_cull_event$Year, "NInd"] population_counts_after_cull = population_counts[population_counts$Year > first_cull_event$Year, , drop = F] years_to_recovery = which(population_counts_after_cull$NInd >= (pop_at_first_cull_event * .95))[1] # First year to pass 95% threshold return(years_to_recovery) }) 2:0:0:17 2:0:0:25 2:0:0:19 2:0:0:16 2:0:0:21 2:0:0:24 2:0:0:23 2:0:0:20 2:0:0:18 2:0:0:33 1 2 1 NA NA NA NA NA NA NA (The output contains some NAs because the first cull year was outside the range of population counts in the data you gave us) Please check this against your expected output though. There were some aspects of the question and example code that were not clear (see comments).
R: Create Stock Indicator from OHLC data
I have OHLC (Open/High/Low/Close) data which we can get using Finance API and all. I want to create a target indicator (-1,0,1) on which I will build stock classification model. To create this target variable. I need to create another indicator, log(tomorrow's CLOSE/today's CLOSE) Which will give me value in (-inf to inf). Now, I want to create labels=c(-1, 0, 1) from breaks=c(-Inf, range_start, range_end, Inf) of log(tomorrow's CLOSE/today's CLOSE). My first question is to create this target variable without looking into the future data, as my formula log(tomorrow's CLOSE/today's CLOSE) looks into the future, which is wrong, I want to shift the dataframe/inputs backward by one row and treat today as tomorrow and so on. and then, calculate the target category, based on range_start, range_end and breaks I will define, the -1, 0,1 . My 2nd question is how can i define it in best manner, this value, I am taking this as -0.0015,0.0015 as of now. need some comments and suggestions here, thanks. masterDF_close <- masterDF %>% dplyr::select('Date', 'Close') # create a one-row matrix the same length as data temprow <- matrix(c(rep.int(NA,length(masterDF))),nrow=1,ncol=length(masterDF)) # make it a data.frame and give cols the same names as data newrow <- data.frame(temprow) colnames(newrow) <- colnames(masterDF) # rbind the empty row to data masterDF <- rbind(newrow,masterDF) ###View(masterDF) temprow2 <- matrix(c(rep.int(NA,length(masterDF_close))),nrow=1,ncol=length(masterDF_close)) # make it a data.frame and give cols the same names as data newrow2 <- data.frame(temprow2) colnames(newrow2) <- colnames(masterDF_close) # rbind the empty row to data masterDF_close <- rbind(masterDF_close, newrow2) masterDF['Close_unshifted'] = masterDF_close$Close ###View(masterDF) # Shifting data backwards, assuming today Close as tomorrow Close and yesterday Close as today Close # close <- masterDF$Close # lead_close <- lag(close, k = -1) # # close[1:10] # lead_close[1:10] # # log(close/lead_close) # # plot(log(close/lead_close)) masterDF['TargetIndicator'] <- log(masterDF$Close_unshifted/masterDF$Close) ###View(masterDF) masterDF = masterDF[-1,] masterDF$TargetIndicator[is.na(masterDF$TargetIndicator)] <- 0 masterDF_ <- masterDF %>% mutate(category=cut(TargetIndicator, breaks=c(-Inf, range_start, range_end, Inf), labels=c(-1, 0, 1))) These are two operations, I am doing on the code.
Different methods to expand R data
I have the following data, and I would like to expand it. For example, if June has two successes, and one failure, my dataset should look like: month | is_success ------------------ 6 | T 6 | T 6 | F Dataset is as follows: # Months from July to December months <- 7:12 # Number of success (failures) for each month successes <- c(11,22,12,7,6,13) failures <- c(20,19,11,16,13,10) A sample solution is as follows: dataset<-data.frame() for (i in 1:length(months)) { dataset <- rbind(dataset,cbind(rep(months[i], successes[i]), rep(T, successes[i]))) dataset <- rbind(dataset,cbind(rep(months[i], failures[i]), rep(F, failures[i]))) } names(dataset) <- c("months", "is_success") dataset[,"is_success"] <- as.factor(dataset[,"is_success"]) Question: What are the different ways to rewrite this code? I am looking for a comprehensive solution with different but efficient ways (matrix, loop, apply). Thank you!
Here is one way with rep. Create a dataset with 'months' and 'is_success' based on replication of 1 and 0. Then replicate the rows by the values of 'successes', 'failures', order if necessary and set the row names to 'NULL' d1 <- data.frame(months, is_success = factor(rep(c(1, 0), each = length(months)))) d2 <- d1[rep(1:nrow(d1), c(successes, failures)),] d2 <- d2[order(d2$months),] row.names(d2) <- NULL Now, we check whether this is equal to the data created from for loop all.equal(d2, dataset, check.attributes = FALSE) #[1] TRUE Or as #thelatemail suggested, 'd1' can be created with expand.grid d1 <- expand.grid(month=months, is_success=1:0)
using mapply you can try this: createdf<-function(month,successes,failures){ data.frame(month=rep(x = month,(successes+failures)), is_success=c(rep(x = T,successes), rep(x = F,failures)) ) } Now create a list of required data.frames: lofdf<-mapply(FUN = createdf,months,successes,failures,SIMPLIFY = F) And then combine using the plyr package's ldply function: resdf<-ldply(lofdf,.fun = data.frame)
Merging two rows of two datsets with different length using R
I have problems by merging two dataframes with different length. To make it as easy as possible the datasets: Dataset A - Persons http://pastebin.com/HbaeqACi Dataset B - Waterfeatures: http://pastebin.com/UdDvNtHs Dataset C - City: http://pastebin.com/nATnkMRk I have some R-Code , which is not relevant for my problem, but I will paste it completely, so you have exactly the same situation: require(fossil) library(fossil) #load data persons = read.csv("person.csv", header = TRUE, stringsAsFactors=FALSE) water = read.csv("water.csv", header =TRUE, stringsAsFactors=FALSE) city = read.csv("city.csv", header =TRUE) #### calculate distance # Generate unique coordinates dataframe UniqueCoordinates <- data.frame(unique(persons[,4:5])) UniqueCoordinates$Id <- formatC((1:nrow(UniqueCoordinates)), width=3,flag=0) #Generate a function that looks for the closest waterfeature for each id coordinates and calculate/save the distance NearestW <- function(id){ tmp <- UniqueCoordinates[UniqueCoordinates$Id==id, 1:2] WaterFeatures <- rbind(tmp,water[,2:3]) disnw <- earth.dist(WaterFeatures, dist=TRUE)[1:(nrow(WaterFeatures)-1)] disnw <- min(disnw) disnw <- data.frame(disnw, WaterFeature=tmp) return(disnw) } # apply distance calculation function to each id and the merge CoordinatesWaterFeature <- ldply(UniqueCoordinates$Id, NearestW) persons <- merge(persons, CoordinatesWaterFeature, by.x=c(4,5), by.y=c(2,3)) Now I want to copy the calculated distance to the city dataset. I've tried to use merge (both datasets have the city attribute) and the persons only contains the cities from the city dataset. city_all_parameters = city city_all_parameters = merge(city_all_parameters, persons[,c("city", "disnw")], all=TRUE) Unfortunately this is not the outcome, which I want to have. I have 164 rows, but I only want to have these 5 rows + the variable disnw and it's corresponding value. I've tried it out with rbind as well, but there I get the error: "Error in rbind(deparse.level, ...) : numbers of columns of arguments do not match" Any tip, how to solve this problem?
Your code works as you intended, but I wanted to show you a more elegant way to do it in base. I have commented the code: library(fossil) # If you want to use pastebin, you can make it easy to load in for us like this: # But I recommend using dput(persons) and pasting the results in. persons = read.csv("http://pastebin.com/raw.php?i=HbaeqACi", header = TRUE, stringsAsFactors=FALSE) water = read.csv("http://pastebin.com/raw.php?i=UdDvNtHs", header =TRUE, stringsAsFactors=FALSE) city = read.csv("http://pastebin.com/raw.php?i=nATnkMRk", header =TRUE) # Use column names instead of column indices to clarify your code UniqueCoordinates <- data.frame(unique(persons[,c('POINT_X','POINT_Y')])) # I didn't understand why you wanted to format the Id, # but you don't need the Id in this code # UniqueCoordinates$Id <- formatC((1:nrow(UniqueCoordinates)), width=3,flag=0) # Instead of calculating the pairwise distance between all # the water points everytime, use deg.dist with mapply: UniqueCoordinates$disnw <- mapply(function(x,y) min(deg.dist(long1=x,lat1=y, long2=water$POINT_X, lat2=water$POINT_Y)), UniqueCoordinates$POINT_X, UniqueCoordinates$POINT_Y) persons <- merge(UniqueCoordinates,persons) # I think this is what you wanted... unique(persons[c('city','disnw')]) # city disnw # 1 City E 6.4865635 # 20 City A 1.6604204 # 69 City B 0.9893909 # 113 City D 0.6001968 # 148 City C 0.5308953 # If you want to merge to the city merge(persons,city,by='city')