Merging rows of two datasets with different lengths using R

I am having problems merging two dataframes of different lengths.
To make it as easy as possible, here are the datasets:
Dataset A - Persons
http://pastebin.com/HbaeqACi
Dataset B - Waterfeatures:
http://pastebin.com/UdDvNtHs
Dataset C - City:
http://pastebin.com/nATnkMRk
I have some R code which is not directly relevant to my problem, but I will paste it completely so you have exactly the same situation:
library(fossil)
library(plyr) # needed for ldply() below
#load data
persons = read.csv("person.csv", header = TRUE, stringsAsFactors=FALSE)
water = read.csv("water.csv", header =TRUE, stringsAsFactors=FALSE)
city = read.csv("city.csv", header =TRUE)
#### calculate distance
# Generate unique coordinates dataframe
UniqueCoordinates <- data.frame(unique(persons[,4:5]))
UniqueCoordinates$Id <- formatC((1:nrow(UniqueCoordinates)), width=3,flag=0)
# Generate a function that looks for the closest water feature for each id's coordinates and calculates/saves the distance
NearestW <- function(id){
  tmp <- UniqueCoordinates[UniqueCoordinates$Id == id, 1:2]
  WaterFeatures <- rbind(tmp, water[, 2:3])
  disnw <- earth.dist(WaterFeatures, dist = TRUE)[1:(nrow(WaterFeatures) - 1)]
  disnw <- min(disnw)
  disnw <- data.frame(disnw, WaterFeature = tmp)
  return(disnw)
}
# apply the distance calculation function to each id and then merge
CoordinatesWaterFeature <- ldply(UniqueCoordinates$Id, NearestW)
persons <- merge(persons, CoordinatesWaterFeature, by.x=c(4,5), by.y=c(2,3))
Now I want to copy the calculated distance into the city dataset. I've tried to use merge (both datasets have the city attribute, and persons only contains cities that appear in the city dataset).
city_all_parameters = city
city_all_parameters = merge(city_all_parameters, persons[,c("city", "disnw")], all=TRUE)
Unfortunately this is not the outcome I want. I get 164 rows, but I only want the 5 city rows plus the variable disnw and its corresponding value.
I've tried rbind as well, but there I get the error:
"Error in rbind(deparse.level, ...) : numbers of columns of arguments do not match"
Any tips on how to solve this problem?

Your code works as you intended, but I wanted to show you a more elegant way to do it in base R. I have commented the code:
library(fossil)
# If you want to use pastebin, you can make it easy to load in for us like this:
# But I recommend using dput(persons) and pasting the results in.
persons = read.csv("http://pastebin.com/raw.php?i=HbaeqACi", header = TRUE, stringsAsFactors=FALSE)
water = read.csv("http://pastebin.com/raw.php?i=UdDvNtHs", header =TRUE, stringsAsFactors=FALSE)
city = read.csv("http://pastebin.com/raw.php?i=nATnkMRk", header =TRUE)
# Use column names instead of column indices to clarify your code
UniqueCoordinates <- data.frame(unique(persons[,c('POINT_X','POINT_Y')]))
# I didn't understand why you wanted to format the Id,
# but you don't need the Id in this code
# UniqueCoordinates$Id <- formatC((1:nrow(UniqueCoordinates)), width=3,flag=0)
# Instead of calculating the pairwise distances between all
# the water points every time, use deg.dist with mapply:
UniqueCoordinates$disnw <- mapply(function(x,y) min(deg.dist(long1=x,lat1=y,
long2=water$POINT_X,
lat2=water$POINT_Y)),
UniqueCoordinates$POINT_X,
UniqueCoordinates$POINT_Y)
persons <- merge(UniqueCoordinates,persons)
# I think this is what you wanted...
unique(persons[c('city','disnw')])
# city disnw
# 1 City E 6.4865635
# 20 City A 1.6604204
# 69 City B 0.9893909
# 113 City D 0.6001968
# 148 City C 0.5308953
# If you want to merge to the city
merge(persons,city,by='city')
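If you literally want just the 5 city rows plus disnw, here is a minimal sketch (assuming, as in the output above, you want one distance per city; min() is used as the summary, swap it for mean() or another summary if that fits your data better):
city_disnw <- aggregate(disnw ~ city, data = persons, FUN = min)
city_all_parameters <- merge(city, city_disnw, by = "city")
# 5 rows: one per city, with its disnw value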

Looping the R package "seg" function with changing data selection

I am calculating the dissimilarity index of several groups compared to the total population with the function "seg" from the identically named package.
The data consists of about 450 rows, each a different district, and around 20 columns (groups that may be segregated). The values are the number of people from respective group living in respective district. Here are the first few rows of my csv file:
Region,Germany,EU15 without Germany,Poland,Former Yugoslavia and successor countries,Former Soviet Union and successor countries,Turkey,Arabic states,West Afrika,Central Afrika,East Afrika,North America,Central America and the Carribean,South America,East and Central Asia,South and Southeast Asia - excluding Vietnam,Australia and Oceania,EU,Vietnam,Non EU Europe,Total Population
1011101,1370,372,108,35,345,91,256,18,6,3,73,36,68,272,98,3,1979,19,437,3445
1011102,117,21,6,0,0,0,6,0,0,0,7,0,6,0,7,0,156,0,3,188
1011103,2180,482,181,102,385,326,358,48,12,12,73,24,75,175,129,12,3152,34,795,5159
Since the seg function only works with two columns as input, my current code to create a table with the index for all groups looks like this:
DI_table <- as.data.frame(0)
DI_table[1,1] <- print (seg(data =dfplrcountrygroups2019[, c( "Germany", "Total.Population")]))
DI_table[1,2] <- print (seg(data =dfplrcountrygroups2019[, c( colnames(dfplrcountrygroups2019)[3], "Total.Population")]))
DI_table[1,3] <- print (seg(data =dfplrcountrygroups2019[, c( colnames(dfplrcountrygroups2019)[4], "Total.Population")]))
DI_table[1,4] <- print (seg(data =dfplrcountrygroups2019[, c( colnames(dfplrcountrygroups2019)[5], "Total.Population")]))
# and so on...
colnames(DI_table)<- (colnames(dfplrcountrygroups2019[2:20]))
It works well, but it's a hassle to recode every time I change something in my data, and I would like to use this method for other datasets too.
I thought I might try something like the loop below, but the seg function did not consider it a selection of two columns.
for (i in colnames(dfplrcountrygroups2019)) {
  di_matrix[i] <- seg(data = dfplrcountrygroups2019[, c("i", "Total.Population")])
}
Error in `[.data.frame`(dfplrcountrygroups2019, , c("i", "Total.Population")) :
  undefined columns selected
I also thought of the apply function, but I'm not sure how to make it work so that it repeats itself while changing only the column where "Germany" is in the example. How do I make the selection of columns change each time I repeat the seg function?
my_function <- seg(data =dfplrcountrygroups2019[, c("Germany", "Total.Population")])
apply(X = dfplrcountrygroups2019,
FUN = my_function,
MARGIN = 2
)
Error in get(as.character(FUN), mode = "function", envir = envir) :
object 'my_function' of mode 'function' was not found
The seg package's functions such as dissim (seg::seg is being deprecated in its favor) have a specific expected data format. From the docs:
data - a numeric matrix or data frame with two columns that represent mutually exclusive population groups (e.g., Asians and non-Asians). If more than two columns are given, only the first two will be used for computing the index.
To get a data frame of the d values seg::dissim returns, where each column is a region's dissimilarity index, you can iterate over the columns, making a temporary data frame and calculating the index. Because the data you're starting with isn't made up of mutually-exclusive categories, you'll have to subtract each population from the total population column to get a not-X counterpart for each group X.
A base R option with sapply will return a named list, which you can then convert into a data frame.
di_table <- sapply(names(dat)[2:20], function(col) {
  tmp_df <- dat[col]
  tmp_df$other <- dat$Total.Population - dat[col]
  seg::dissim(data = tmp_df)$d
}, simplify = FALSE)
as.data.frame(di_table)
#> Germany EU15.without.Germany Poland
#> 1 0.03127565 0.03989693 0.02770549
#> Former.Yugoslavia.and.successor.countries
#> 1 0.160239
#> Former.Soviet.Union.and.successor.countries Turkey Arabic.states West.Afrika
#> 1 0.08808277 0.2047 0.02266828 0.1415519
#> Central.Afrika East.Afrika North.America Central.America.and.the.Carribean
#> 1 0.08004711 0.213581 0.1116014 0.2095969
#> South.America East.and.Central.Asia
#> 1 0.08486598 0.2282734
#> South.and.Southeast.Asia...excluding.Vietnam Australia.and.Oceania EU
#> 1 0.0364721 0.213581 0.04394527
#> Vietnam Non.EU.Europe
#> 1 0.05505789 0.06624686
A couple of tidyverse options: you can use purrr functions to do something like the above in one step.
dat[2:20] %>%
  purrr::map(~data.frame(value = ., other = dat$Total.Population - .)) %>%
  purrr::map_dfc(~seg::dissim(data = .)$d)
# same output
Or by reshaping the data and splitting it by group. This takes more steps, but might fit a larger workflow better.
library(dplyr)
dat %>%
  tidyr::pivot_longer(c(-Region, -Total.Population)) %>%
  mutate(other = Total.Population - value) %>%
  split(.$name) %>%
  purrr::map_dfc(~seg::dissim(data = .[c("value", "other")])$d)
# same output
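As a side note on the original for loop: it fails because "i" is quoted, so R looks for a literal column named "i" instead of using the loop variable. A rough sketch of the fixed loop along the lines of your original approach (still without the mutually-exclusive adjustment discussed above, so interpret the values accordingly):
di_list <- list()
for (i in colnames(dfplrcountrygroups2019)[2:20]) {
  di_list[[i]] <- seg::dissim(data = dfplrcountrygroups2019[, c(i, "Total.Population")])$d
}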

How to get the neighboring counties of all counties based on the content of the file?

I have a shapefile. I want to get the names of the neighboring counties of every county, based on the latitude and longitude in the file. Some counties obviously have neighbors, but I didn't get any neighboring counties when I ran the code. I don't know what went wrong.
library(tidyverse)
library(plyr)
library(sf)
library(readxl)
county <- st_read('D:/county.shp', stringsAsFactors = FALSE)
neighbor_counties <- function(subcounty){
  name <- st_touches(subcounty, county)
  county[unlist(name), ]$NAME
}
output <- vector("list", nrow(county))
names(output) <- county$NAME
for (i in seq_len(nrow(county))) {
  output[[i]] <- suppressWarnings(neighbor_counties(county[i, ]))
}
output
head(output)
neighbor <- output %>%
  ldply(data.frame) %>%
  set_names("ori_county", "neighbor_county")
Your example is not exactly reproducible, but we are lucky to have the nc.shp shapefile that ships with {sf} available.
So consider this code; it is built on the sf::st_touches() function, with the county shapefile passed as an argument twice (once for the counties doing the touching, and once for the counties being touched). sparse = TRUE makes it return a list of indices of neighboring counties.
To find the names of the neighbors of a particular county, you need to know the index of the county of interest and then subset the list of neighbors accordingly; this gives you the indices of the neighboring counties.
As for the second part of your question (expressed in comments), how to get from a list of indices to a data frame of neighbors: I suggest creating a function that returns a data frame and then applying it via purrr::map_dfr() to the vector of indices as starting points; consider the code provided and amend as necessary. It should give you a start...
library(sf)
shape <- st_read(system.file("shape/nc.shp", package="sf")) # included with sf package
# a list of neighbors
neighbors <- st_touches(shape, # first: counties doing the touching
                        shape, # second: counties being touched
                        sparse = TRUE)
# neighbors of County Mecklenburg (as in Charlotte of Mecklenburg-Strelitz)
# index of Mecklenburg cnty
idx_strelitz <- which(shape$NAME == 'Mecklenburg')
# index of neighbors of Mecklenburg cnty
nbr_mecklenburg <- neighbors[idx_strelitz][[1]]
# names of neighbours of cnty Meckl.
shape$NAME[nbr_mecklenburg]
# [1] "Iredell" "Lincoln" "Cabarrus" "Gaston" "Union"
# a visual check
plot(st_geometry(shape))
plot(shape[idx_strelitz, ], col = "blue", add = T)
plot(shape[nbr_mecklenburg,], col = "red", add = T)
# second question: get pairs of names as a data frame
# a function returning data frame of neighbors of a given cnty
nbr_pairs <- function(idx) {
  data.frame(ori_county = rep(shape$NAME[idx], length(neighbors[[idx]])),
             neighbor_county = shape$NAME[neighbors[[idx]]])
}
# check - cnty Mecklemburg
nbr_pairs(idx_strelitz)
# ori_county neighbor_county
# 1 Mecklenburg Iredell
# 2 Mecklenburg Lincoln
# 3 Mecklenburg Cabarrus
# 4 Mecklenburg Gaston
# 5 Mecklenburg Union
# apply to list of indices
pairs_of_names <- purrr::map_dfr(seq_along(neighbors),
nbr_pairs)
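As a design note, the same pairs data frame can also be built in one step from the sparse list, since lengths() gives the number of neighbours per county; this is an alternative sketch, not part of the original approach:
pairs_of_names2 <- data.frame(ori_county = rep(shape$NAME, lengths(neighbors)),
                              neighbor_county = shape$NAME[unlist(neighbors)])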

How to efficiently calculate distance between GPS points in one dataset and GPS points in another data set using data.table

I am facing a coding (optimization) problem in R. I have a long data set with GPS coordinates (lon, lat, timestamp) and for every row I need to check whether the location is near a bus stop. I have a .csv file with all the bus stops (in the Netherlands). The GPS coordinates file is millions of entries long, but could be split if necessary. The bus stop dataset is around 5500 entries long.
Using the code and tips given on, inter alia, these pages:
1) How to efficiently calculate distance between pair of coordinates using data.table :=
2) Using a simple for loop on spatial data
3) Calculate distance between two latitude-longitude points? (Haversine formula)
4) Fastest way to determine COUNTRY from millions of GPS coordinates [R]
I was able to construct code that works, but it is (too) slow. I was wondering if someone could help me with a faster data.table implementation, or point out where the bottleneck in my code is. Is it the spDistsN1() function, or maybe the combination of apply() and melt()? I am most comfortable in R, but I am open to other software (as long as it is open source).
Due to privacy concerns I cannot upload the full dataset, but this is a (small) reproducible example that is not too different from how the real data looks.
# packages:
library(data.table)
library(tidyverse)
library(sp)
# create GPS data
number_of_GPS_coordinates <- 20000
set.seed(1)
gpsdata <- as.data.frame(cbind(id  = 1:number_of_GPS_coordinates,
                               lat = runif(number_of_GPS_coordinates, 50.5, 53.5),
                               lon = runif(number_of_GPS_coordinates, 4, 7)))
# create some busstop data. In this case only 2000 bus stops
set.seed(1)
number_of_bus_stops <- 2000
stop<-as.data.frame(gpsdata[sample(nrow(gpsdata), number_of_bus_stops), -1]) # of course do not keep id variable
stop$lat<-stop$lat+rnorm(number_of_bus_stops,0,.0005)
stop$lon<-stop$lon+rnorm(number_of_bus_stops,0,.0005)
busdata.data<-cbind(stop, name=replicate(number_of_bus_stops, paste(sample(LETTERS, 15, replace=TRUE), collapse="")))
names(busdata.data) <- c("latitude_bustops", "longitude_bustops", "name")
You can download the real bus stop data if you want; it is kind of hard to reproduce a random sample of this.
#temp <- tempfile()
#download.file("http://data.openov.nl/haltes/stops.csv.gz", temp) #1.7MB
#gzfile(temp, 'rt')
#busstopdata <- read.csv(temp, stringsAsFactors = FALSE)
#unlink(temp)
#bus_stops <- fread("bus_stops.csv")
#busdata.data <- busstopdata %>%
# mutate(latitude_bustops = latitude)%>%
# mutate(longitude_bustops = longitude)%>%
# dplyr::select(name, latitude_bustops, longitude_bustops)
This is the code I use now to calculate distances. It works, but it is pretty slow:
countDataPoints3 <- function(p) {
  distances <- spDistsN1(data.matrix(gpsdata[, c("lon", "lat")]),
                         p,
                         longlat = TRUE) # in km
  return(which(distances <= .2)) # distance is now set to 200 meters
}
# code to check per data point if a bus stop is near and save this per bus stop in a list entry
datapoints.by.bustation <- apply(data.matrix(busdata.data[,c("longitude_bustops","latitude_bustops")]), 1, countDataPoints3)
# rename list entries
names(datapoints.by.bustation) <- busdata.data$name
# melt list into one big data.frame
long.data.frame.busstops <- melt(datapoints.by.bustation)
# now switch to data.table grammar to speed up process
# set data.table
setDT(gpsdata)
gpsdata[, rowID := 1:nrow(gpsdata)]
setkey(gpsdata, key = "rowID")
setDT(long.data.frame.busstops)
# merge the data, and filter non-unique entries
setkey(long.data.frame.busstops, key = "value")
GPS.joined <- merge(x = gpsdata, y = long.data.frame.busstops, by.x= "rowID", by.y= "value", all.x=TRUE)
GPS.joined.unique <- unique(GPS.joined, by="id") # mak
# this last part of the code is needed to make sure that if there are more than 1 bus stop nearby it puts these bus stop in a list
# instead of adding row and making the final data.frame longer than the original one
GPS.joined.unique2 <- setDT(GPS.joined.unique)[order(id, L1), list(L1=list(L1)), by=id]
GPS.joined.unique2[, nearby := TRUE][is.na(L1), nearby := FALSE] # add a dummy to check if any bus stop is nearby.
# makes sense:
as.tibble(GPS.joined.unique2) %>%
summarize(sum = sum(nearby))
Consider a slicing method: first cut by close latitudes and close longitudes, in this case 0.5 degrees of latitude and 0.5 degrees of longitude (which is still about a 60 km disc). We can use data.table's superb support for rolling joins.
The following takes a few milliseconds for 20,000 entries and only a few seconds for 2M entries.
library(data.table)
library(hutils)
setDT(gpsdata)
setDT(busdata.data)
gps_orig <- copy(gpsdata)
busdata.orig <- copy(busdata.data)
setkey(gpsdata, lat)
# Just to take note of the originals
gpsdata[, gps_lat := lat + 0]
gpsdata[, gps_lon := lon + 0]
busdata.data[, lat := latitude_bustops + 0]
busdata.data[, lon := longitude_bustops + 0]
setkey(busdata.data, lat)
gpsID_by_lat <-
gpsdata[, .(id), keyby = "lat"]
By_latitude <-
busdata.data[gpsdata,
on = "lat",
# within 0.5 degrees of latitude
roll = 0.5,
# +/-
rollends = c(TRUE, TRUE),
# and remove those beyond 0.5 degrees
nomatch=0L] %>%
.[, .(id_lat = id,
name_lat = name,
bus_lat = latitude_bustops,
bus_lon = longitude_bustops,
gps_lat,
gps_lon),
keyby = .(lon = gps_lon)]
setkey(busdata.data, lon)
By_latlon <-
busdata.data[By_latitude,
on = c("name==name_lat", "lon"),
# within 0.5 degrees of latitude
roll = 0.5,
# +/-
rollends = c(TRUE, TRUE),
# and remove those beyond 0.5 degrees
nomatch=0L]
By_latlon[, distance := haversine_distance(lat1 = gps_lat,
lon1 = gps_lon,
lat2 = bus_lat,
lon2 = bus_lon)]
By_latlon[distance < 0.2]
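If you then want a per-GPS-point flag like the nearby dummy in the question, a rough sketch (this assumes id_lat still carries the original gps id through both joins, which it should, but check against your data):
near_ids <- unique(By_latlon[distance < 0.2, id_lat])
gpsdata[, nearby := id %in% near_ids]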
This is the function I came up with so far. @Dave2e, thanks. It is already an awful lot faster than what I had. There is still clearly room for a lot of improvement, but as it stands it is fast enough for my analysis now. I only slice by latitude and not by longitude; the only reason for that is that it makes indexing and then looping over indices really easy, but more speed could be gained by also indexing by longitude. Also, in real GPS data there tend to be many duplicate values (same lon/lat, different timestamp); the code would be more efficient if it took this into account. Maybe I will work on that in the future.
# this app could be much faster if it would filter by duplicate GPS coordinates
check_if_close <- function(dataset1 = GPS.Utrecht.to.Gouda,
                           dataset2 = bus_stops,
                           n.splits = 500,
                           desired.dist = .2){
# dataset1 needs at least the columns
# - "id",
# - "device_id"
# - "latitude"
# - "longitude"
# dataset2 needs at least the columns
# - "id",
# - "name"
# - "latitude"
# - "longitude"
# these are the average coordinates of the Netherlands. A change of .0017 in latitude leads to a change of 189 meters
# spDistsN1(matrix(c(5.2913, 52.1326), ncol=2), matrix(c(5.2913, 52.1326+.0017), ncol=2), longlat=TRUE)*1000
# [1] 189.1604
# this means that the latitude slices we cut (the subsection of) the Netherlands into have to be at least .0017 wide.
# if we look at the Netherlands as a whole, this would mean we can use at most (53.5-50.5)/.0017 = 1765 slices.
# if we look only at a small subsection (because we are only looking at a single trip, for example) we need far fewer slices.
# 1) we only select the variables we need from dataset 1
dataset1 <- setDT(dataset1)[,c("id", "device_id", "latitude", "longitude")]
setnames(dataset1, old = c("id", "latitude", "longitude"), new = c("id_dataset1", "latitude_gps", "longitude_gps"))
# 2) we only select the variables we need from dataset 2
dataset2 <- setDT(dataset2)[,c("id", "name", "latitude", "longitude")]
setnames(dataset2, old = c("id", "latitude", "longitude"), new = c("id_dataset2", "latitude_feature", "longitude_feature"))
# 3) only keep the subset of dataset2 that falls within dataset1.
# There is no reason to check if features are close that already fall outside the GPS coordinates of the trip we want to check.
# We do add a 0.01 point margin around it to be on the safe side: maybe a feature falls just outside the GPS coordinates,
# but is still near to a GPS point
dataset2 <- dataset2[latitude_feature %between% (range(dataset1$latitude_gps) + c(-0.01, +0.01))
& longitude_feature %between% (range(dataset1$longitude_gps) + c(-0.01, +0.01)), ]
# 4) we cut dataset2 into slices on the latitude dimension
# some trial and error is involved in getting the right amount. if you use too many you get a large and redundant amount of empty values
# if you use too few you need to check too many GPS-to-feature distances per slice
dataset2[, range2 := as.numeric(Hmisc::cut2(dataset2$latitude_feature, g=n.splits))]
# 5) calculate the ranges of the slices we just created
ranges <- dataset2[,list(Min=min(latitude_feature), Max= max(latitude_feature)), by=range2][order(range2)]
setnames(ranges, old = c("range2", "Min", "Max"), new = c("latitude_range", "start", "end"))
# 6) now we assign to which slice every GPS coordinate in our dataset1 belongs
# this is super fast when using data.table grammar
elements1 <- dataset1$latitude_gps
ranges <- setDT(ranges)[data.table(elements1), on = .(start <= elements1, end >=elements1)]
ranges[, rowID := seq_len(.N)]
dataset1[,rowID := seq_len(.N)]
setkey(dataset1, rowID)
setkey(ranges, rowID)
dataset1<-dataset1[ranges]
# 7) this is the actual function we use to check if a datapoint is nearby.
# potentially there are faster functions to do this??
checkdatapoint <- function(p, h, dist = desired.dist) {
  distances <- spDistsN1(data.matrix(filter(dataset1, latitude_range == h)[, c("longitude_gps", "latitude_gps")]),
                         p,
                         longlat = TRUE) # in km
  return(which(distances <= dist)) # distance is now set to 200 meters
}
# 8) we assign an ID to dataset1, restarting at every slice.
# we need this to later match the data again
dataset1[, ID2 := sequence(.N), by = latitude_range]
# 9) here we loop over all the splits, and for every point check if there is a feature nearby in the slice it falls in
# to be on the safe side we also check the slices to the left and right of it, just to make sure we do not miss features that
# are nearby but just fall in a different slice.
# 9a: create an empty list we fill with dataframes later
TT<-vector("list", length=n.splits)
# 9b: loop over the number of slices using above defined function
for(i in 1:n.splits){
datapoints.near.feature<-apply(data.matrix(dataset2[range2 %in% c(i-1,i, i+1), c("longitude_feature","latitude_feature")]), 1, checkdatapoint, h=i)
# 9c: if in that slice there was no match between a GPS coordinate and a nearby feature, we create an empty list entry
if(class(datapoints.near.feature)=="integer"|class(datapoints.near.feature)=="matrix"){
TT[[i]] <-NULL
} else {
# 9d: if there was a match, we get a named list of data points
names(datapoints.near.feature) <- dataset2[range2 %in% c(i-1,i, i+1), name]
# 9e: then we 'melt' this list into data.frame
temp <- melt(datapoints.near.feature)
# 9f: then we transform it into a data.table and change the names
setDT(temp)
setnames(temp, old=c("value", "L1"), new= c("value", "feature_name"))
# 9h: then we only select the data points in dataset1 that fall in the current slice, give them an
# ID and merge them with the file of nearby bus stops
gpsdata.f <- dataset1[latitude_range==i, ]
gpsdata.f[, rowID2 := seq_len(.N)]
setkey(gpsdata.f, key = "rowID2")
setkey(temp, key = "value")
GPS.joined.temp <- merge(x = gpsdata.f, y = temp, by.x= "rowID2", by.y= "value", all.x=TRUE)
# 9i: we only keep the unique entries and for every slice save them to the list
GPS.joined.unique.temp <- unique(GPS.joined.temp, by=c("id_dataset1", "feature_name"))
TT[[i]] <- GPS.joined.unique.temp
cat(paste0(round(i/n.splits*100), '% completed'), " \r"); flush.console()
#cat(i/n.splits*100, " \r"); flush.console()
}
}
# 10) now we left join the original dataset and the data points that are near a feature
finallist<- merge(x = dataset1,
y = rbindlist(TT[vapply(TT, Negate(is.null), NA)]),
by.x= "id_dataset1",
by.y= "id_dataset1",
all.x=TRUE)
# 11) we add a new logical variable to check if any bus stop is near
finallist[, nearby := TRUE][is.na(feature_name), nearby := FALSE] # add a dummy to check if any bus stop is nearby.
# 12) if a point is near multiple features at once these are listed in a vector,
# instead of having duplicate rows with the same id but different features
finallist <- unique(setDT(finallist)[order(id_dataset1, feature_name), list(feature_name=list(feature_name), id=id_dataset1, lat=latitude_gps.x, lon=longitude_gps.x, nearby=nearby), by=id_dataset1], by="id_dataset1")
return(finallist)
}
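A hypothetical call, using the simulated data from the question (the column renaming below is my own assumption; the function expects id/device_id/latitude/longitude in dataset1 and id/name/latitude/longitude in dataset2):
gps_trip <- data.frame(id = gpsdata$id, device_id = 1,
                       latitude = gpsdata$lat, longitude = gpsdata$lon)
stops <- data.frame(id = seq_len(nrow(busdata.data)), name = busdata.data$name,
                    latitude = busdata.data$latitude_bustops,
                    longitude = busdata.data$longitude_bustops)
result <- check_if_close(dataset1 = gps_trip, dataset2 = stops,
                         n.splits = 500, desired.dist = 0.2)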

Difficulty combining lists, characters, and numbers into data frame

I'm lost on how to combine my data into a usable data frame. I have a list of lists of character and number vectors. Here is a working example of my code so far:
remove(list=ls())
# Headers for each of my column names
headers <- c("name","p","c","prophylaxis","control","inclusion","exclusion","conversion excluded","infection criteria","age criteria","mean age","age sd")
#_name = author and year
#_p = no. in experimental arm.
#_c = no. in control arm
#_abx = antibiotic used
#_con = control used
#_inc = inclusion criteria
#_exc = exclusion criteria
#_coexc = was conversion to open excluded?
#_infxn = infection criteria
#_agecrit = age criteria
#_agemean = mean age of study
#_agesd = sd age of study
# Passos 2016
passos_name <- c("Passos","2016")
passos_p <- 50
passos_c <- 50
passos_abx <- "cefazolin 1g at induction"
passos_con <- "none"
passos_inc <- c("elective LC","symptomatic cholelithiasis","low risk")
passos_exc <- c("renal impairment","hepatic impairment","immunosuppression","regular steroid use","antibiotics within 48H","acute cholecystitis","choledocolithiasis")
passos_coexc <- TRUE
passos_infxn <- c("temperature >37.8C","tachycardia","asthenia","local pain","local purulence")
passos_agecrit <- NULL
passos_agemean <- 48
passos_agesd <- 13.63
passos <- list(passos_name,passos_p,passos_c,passos_abx,passos_con,passos_inc,passos_exc,passos_coexc,passos_infxn,passos_agecrit,passos_agemean,passos_agesd)
names(passos) <- headers
# Darzi 2016
darzi_name <- c("Darzi","2016")
darzi_p <- 182
darzi_c <- 247
darzi_abx <- c("cefazolin 1g 30min prior to induction","cefazolin 1g 6H after induction","cefazolin 1g 12H after induction")
darzi_con <- "NaCl"
darzi_inc <- c("elective LC","first time abdominal surgery")
darzi_exc <- c("antibiotics within 7 days","immunosuppression","acute cholecystitis","choledocolithiasis","cholangitis","obstructive jaundice",
"pancreatitis","previous biliary tract surgery","previous ERCP","DM","massive intraoperative bleeding","antibiotic allergy","major thalassemia",
"empyema")
darzi_coexc <- TRUE
darzi_infxn <- c("temperature >38C","local purulence","intra-abdominal collection")
darzi_agecrit <- c(">18", "<75")
darzi_agemean <- 43.75
darzi_agesd <- 13.30
darzi <- list(darzi_name,darzi_p,darzi_c,darzi_abx,darzi_con,darzi_inc,darzi_exc,darzi_coexc,darzi_infxn,darzi_agecrit,darzi_agemean,darzi_agesd)
names(darzi) <- headers
# Matsui 2014
matsui_name <- c("Matsui","2014")
matsui_p <- 504
matsui_c <- 505
matsui_abx <- c("cefazolin 1g at induction","cefazolin 1g 12H after induction","cefazolin 1g 24H after induction")
matsui_con <- "none"
matsui_inc <- "elective LC"
matsui_exc <- c("emergent","concurrent surgery","regular insulin use","regular steroid use","antibiotic allergy","HD","antibiotics within 7 days","hepatic impairment","chemotherapy")
matsui_coexc <- FALSE
matsui_infxn <- c("local purulence","intra-abdominal collection","distant infection","temperature >38C")
matsui_agecrit <- ">18"
matsui_agemean <- NULL
matsui_agesd <- NULL
matsui <- list(matsui_name,matsui_p,matsui_c,matsui_abx,matsui_con,matsui_inc,matsui_exc,matsui_coexc,matsui_infxn,matsui_agecrit,matsui_agemean,matsui_agesd)
names(matsui) <- headers
# Find unique exclusion criteria in order to create the list of all possible levels
exc <- ls()[grepl("_exc",ls())]
exclist <- sapply(exc,get)
exc.levels <- unique(unlist(exclist,use.names = F))
# Find unique inclusion criteria in order to create the list of all possible levels
inc <- ls()[grepl("_inc",ls())]
inclist <- sapply(inc,get)
inc.levels <- unique(unlist(inclist,use.names = F))
# Find unique antibiotics order to create the list of all possible levels
abx <- ls()[grepl("_abx",ls())]
abxlist <- sapply(abx,get)
abx.levels <- unique(unlist(abxlist,use.names = F))
# Find unique controls in order to create the list of all possible levels
con <- ls()[grepl("_con",ls())]
conlist <- sapply(con,get)
con.levels <- unique(unlist(conlist,use.names = F))
# Find unique age criteria in order to create the list of all possible levels
agecrit <- ls()[grepl("_agecrit",ls())]
agecritlist <- sapply(agecrit,get)
agecrit.levels <- unique(unlist(agecritlist,use.names = F))
I have been struggling with:
1) Turning each of the _exc, _inc, _abx, _con, _agecrit lists into factors using the levels generated at the end of the code block. I have been trying to use a for loop such as:
for (x in exc) {
  as.name(x) <- factor(get(x), levels = exc.levels)
}
This only creates a variable, x, that stores the last parsed list as a factor.
2) Combining all of my data into a data frame formatted as such:
name, p, c, prophylaxis, control, inclusion, exclusion, conversion excluded, infection criteria, age criteria, mean age, age sd
"Passos 2016", 50, 50, "cefazolin 1g at induction", "none", ["elective LC","symptomatic cholelithiasis","low risk"], ["renal impairment","hepatic impairment","immunosuppression","regular steroid use","antibiotics within 48H","acute cholecystitis","choledocolithiasis"], TRUE, ["temperature >37.8C","tachycardia","asthenia","local pain","local purulence"], NULL, 48, 13.63
...
# [] = factors
# columns correspond to each studies variables (i.e. passos_name, passos_p, passos_c, etc..)
# rows correspond to each study (i.e., passos, darzi, matsui)
I have tried various solutions on StackOverflow, but have not found any that work; for example:
studies <- list(passos,darzi,matsui,ruangsin,turk,naqvi,hassan,sharma,uludag,yildiz,kuthe,koc,maha,tocchi,higgins,mahmoud,kumar)
library(data.table)
rbindlist(lapply(studies,as.data.frame.list))
I suspect my data may not be exactly amenable to a data frame, primarily because I am trying to store a list of factors in a column. Is that allowed? If not, how is this type of data normally stored? My goal is to be able to meaningfully compare these various criteria across studies.
This is too long for a comment, so I am turning it into an "answer":
To start with, have a look at what happens here:
data.frame(name = "Passos, 2016", p = 50)
name p
1 Passos, 2016 50
data.frame(name = c("Passos", "2016"), p = 50)
name p
1 Passos 50
2 2016 50
In the first one, we created a dataframe with the column "name", which contained one entry "Passos, 2016", i.e. one character string containing both pieces of information, and the column "p". All fine. Now, in the second version, I specified the column "name" as you did above, using c("Passos", "2016"). This is a two-element vector, and hence we get two rows in the dataframe: one with name Passos, one with name 2016, and the column p gets recycled.
Clearly, the latter is probably not what you intended. But it works anyway because R just recycles the shorter vector. Now, what do you think happens if I add a vector that contains three elements?
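A quick sketch of what happens (lengths 2 and 3 are not multiples of each other, so recycling fails):
data.frame(name = c("Passos", "2016"), p = 50, inc = c("a", "b", "c"))
# Error in data.frame(...) : arguments imply differing number of rows: 2, 1, 3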
And this highlights the main issue with what you are doing: you are trying to get a dataframe from many vectors with different lengths. Now, in some cases this is fine if you want the shorter vector to be repeated (in R speech, we call this "recycled"), but it does not look like something you want to do here.
So, my recommendation would be this: try to imagine a matrix and make sure you understand what each element (row and column) is supposed to be. Then specify your data accordingly. If in doubt, look up "tidy data".
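If you do want one row per study with the multi-valued fields kept together, one option (a sketch, not the only way) is list-columns: wrap each multi-valued entry in a list and the whole column in I() so data.frame() does not try to recycle it. For example, with the three studies defined above:
studies <- list(passos = passos, darzi = darzi, matsui = matsui)
study_df <- data.frame(
  name = sapply(studies, function(s) paste(s$name, collapse = " ")),
  p = sapply(studies, `[[`, "p"),
  c = sapply(studies, `[[`, "c"),
  inclusion = I(lapply(studies, function(s) factor(s$inclusion, levels = inc.levels))),
  exclusion = I(lapply(studies, function(s) factor(s$exclusion, levels = exc.levels)))
)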

How to dissolve polygon in R and transferring original attribute information

I want to dissolve some polygons, and I am doing the following:
1) Reading in the shapefile (DA.shp - sensitive information, hence only the first two sample records and the first three columns are shown)
2) Reading in a csv file called zone.csv that has the information for dissolving, and joining zone.csv to DA (first five records shown due to sensitivity)
3) Dissolving the joined shapefile
4) Creating row IDs to make the dissolved shapefile into a SpatialPolygonsDataFrame for export.
It all goes smoothly; however, I want to carry the zed and Criteria fields into my dissolved polygons, like one can in a GIS. I have searched in vain, so any help will be appreciated.
library(rgeos)
library(rgdal)
library(sp)
# set working directory
wd <- setwd("c:/Personal/R")
# read DA shapefile
da <- readOGR(wd, "DA")
plot(da)
crs.shp <- proj4string(da)
da@data[1:2, 1:3] # check first two records
OBJECTID DAUID CDUID
0 3 35204831 3520
1 5 35180720 3518
# batch in the text file with zone numbers
zones.csv <- read.csv(file="c:/personal/R/Variant1.csv", header=TRUE, sep=",")
zones.csv$DAUID <- as.character(zones.csv$DAUID) # make DAUID as character for join
zones.csv[1:5,]
DAUID zed Criteria
1 35140110 3102 GGHM zones
2 35140111 3102 GGHM zones
3 35140112 3102 GGHM zones
4 35140113 3102 GGHM zones
5 35140114 3102 GGHM zones
da1 <- da # save a copy
da1@data$DAUID <- as.character(da1@data$DAUID) # make character field for join
da1@data <- merge(da1@data, zones.csv, by.x = "DAUID", by.y = "DAUID", all.x=T, sort=F)
# Now dissolve
zone.shp <- gUnaryUnion(da1, id = da1@data$zed.x)
plot(zone.shp)
# extract zone Id's to make dataframe
Gid <- sapply(slot(zone.shp, "polygons"), function(x) slot(x, "ID"))
# Create dataframe with correct rownames
z.df <- data.frame( ID=1:length(zone.shp), row.names = Gid)
# make Polygondataframe to export as shapefile
zone.shp.pdf <- SpatialPolygonsDataFrame(zone.shp, data=z.df)
zone.shp.pdf@data$crit <-
proj4string(zone.shp.pdf) <- CRS(proj4string(da))
Here is a self-contained reproducible example with some SpatialPolygons:
library(raster)
p <- shapefile(system.file("external/lux.shp", package="raster"))
Create a data.frame, and so on.
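For instance, a minimal sketch of that self-contained example (the zone column is made up just to have something to dissolve on):
set.seed(1)
p$zone <- sample(c("A", "B", "C"), length(p), replace = TRUE) # invented dissolve field
pz <- aggregate(p, by = "zone") # dissolves polygons and keeps the zone attribute
plot(pz)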
Anyway, I think you can use raster::aggregate to solve your problem. Below is a simplified and improved script, but I cannot check if it works as I do not have your data.
library(raster)
da <- shapefile("c:/Personal/DA.shp")
zones <- read.csv("c:/personal/R/Variant1.csv", stringsAsFactors=FALSE)
da1 <- merge(da, zones, by="DAUID", all.x=TRUE)
# Now dissolve
zone.shp <- aggregate(da1, c('zed', 'Criteria'))
If you want to write this to a shapefile:
shapefile(zone.shp, 'file.shp')
