I am attempting to run a clustering algorithm over a list of dissimilarity matrices for different numbers of clusters k and extract some information for each run.
This first block of code produces the list of dissimilarity matrices
library(tidyverse)
library(cluster)
library(rje)
# Build one Gower dissimilarity matrix for every combination of two or more
# of the first three mtcars columns.
dat <- mtcars[, 1:3]
v_names <- names(dat)
# powerSet() lists every subset of the column names; keep those with more
# than one variable.
combos <- rje::powerSet(v_names)
combos <- combos[lengths(combos) > 1]
# One data frame per retained combination. lapply() builds the list in a
# single pass instead of growing it element by element inside a for loop.
df_list <- lapply(combos, function(vars) dat[vars])
gower_ls <- lapply(df_list, daisy, metric = "gower")
Here is the section of code I am having a problem with
# Fit PAM with k = 4 on every dissimilarity matrix in gower_ls and collect
# one row of silhouette / cluster-size summaries per fitted model.
set.seed(4)
model_vars <- c(NA) # declared by the original script but never filled in
clust_4 <- lapply(gower_ls, pam, diss = TRUE, k = 4)

# Components are read by name ($silinfo, $clusinfo) rather than through the
# positional `[7]` / `[6]` slices, so nothing depends on the slot order of the
# fitted object. vapply() also copes with an empty list, where a
# `1:length(clust_4)` loop would iterate over c(1, 0).
sil_width <- vapply(
  clust_4, function(fit) fit$silinfo$avg.width, numeric(1)
)
min_sil <- vapply(
  clust_4, function(fit) min(fit$silinfo$clus.avg.widths), numeric(1)
)
# The first clusinfo column is used for the smallest / largest cluster.
mincluster <- vapply(
  clust_4, function(fit) min(fit$clusinfo[, 1]), numeric(1)
)
maxcluster <- vapply(
  clust_4, function(fit) max(fit$clusinfo[, 1]), numeric(1)
)
k_clusters <- vapply(
  clust_4, function(fit) nrow(fit$clusinfo), integer(1)
)
# Search only the silhouette-width column; the original took min() over the
# whole widths matrix, which also holds non-width columns.
lowest_sil <- vapply(
  clust_4, function(fit) min(fit$silinfo$widths[, "sil_width"]), numeric(1)
)
model_num <- seq_along(clust_4)

colresults_4 <- data.frame(
  sil_width, min_sil, mincluster, maxcluster,
  k_clusters, model_num, lowest_sil
)
How can I convert this piece of code to run for a given range of k? I've tried a nested loop but I was not able to code it correctly. Here are the desired results for k= 4:6, thanks.
structure(list(sil_width = c(0.766467312788453, 0.543226669407726,
0.765018469447229, 0.705326458357873, 0.698351173575526, 0.480565022092276,
0.753366365875066, 0.644345251543097, 0.699437672202048, 0.430310752506775,
0.678224885117295, 0.576411380463116), min_sil = c(0.539324315243191,
0.508330909368204, 0.637090842537915, 0.622120627356455, 0.539324315243191,
0.334047777245833, 0.430814518122641, 0.568591550281139, 0.539324315243191,
0.295113900268025, 0.430814518122641, 0.19040716086259), mincluster = c(5,
3, 4, 5, 2, 3, 3, 3, 2, 3, 3, 3), maxcluster = c(14, 12, 11,
14, 12, 10, 11, 11, 9, 6, 7, 7), k_clusters = c(4, 4, 4, 4, 5,
5, 5, 5, 6, 6, 6, 6), model_num = c(1, 2, 3, 4, 1, 2, 3, 4, 1,
2, 3, 4), lowest_sil = c(-0.0726256983240229, 0.0367238314801671,
0.308069836672298, 0.294247157041013, -0.0726256983240229, -0.122804288130541,
-0.317748917748917, 0.218164082936686, -0.0726256983240229, -0.224849074123824,
-0.317748917748917, -0.459909237820881)), row.names = c(NA, -12L
), class = "data.frame")
I was able to come up with a solution by writing a function clus_func that extracts the cluster information and then using cross2 and map2 from the purrr package:
library(tidyverse)
library(cluster)
library(rje)
# Source data: the first three columns of mtcars.
dat <- mtcars[, 1:3]
v_names <- names(dat)
# Every subset of the column names, restricted to subsets with more than one
# variable.
combos <- Filter(function(s) length(s) > 1, rje::powerSet(v_names))
# Fit PAM on a dissimilarity object and summarise the fit.
#
# x: dissimilarity object, handed to pam() with diss = TRUE.
# k: number of clusters to fit.
#
# Returns a one-row data frame with the columns avg_sil_width,
# min_clus_width, min_individual_sil, max_individual_sil, mincluster,
# maxcluster and num_k.
clus_func <- function(x, k) {
  clust <- pam(x, k, diss = TRUE)
  sil <- clust$silinfo
  ind_widths <- sil$widths[, 3] # third column: per-observation widths
  sizes <- clust$clusinfo[, 1]  # first column: per-cluster values used for min/max

  # Build the row directly with data.frame() and return it visibly; the
  # original ended on an assignment, so the value was returned invisibly.
  data.frame(
    avg_sil_width = sil$avg.width,
    min_clus_width = min(sil$clus.avg.widths),
    min_individual_sil = min(ind_widths),
    max_individual_sil = max(ind_widths),
    mincluster = min(sizes),
    maxcluster = max(sizes),
    num_k = max(clust$clustering)
  )
}
# One data frame, and from it one Gower dissimilarity matrix, per combination.
df_list <- lapply(combos, function(vars) dat[vars])
gower_ls <- lapply(df_list, daisy, metric = "gower")

begin_k <- 4
end_k <- 6

# Every (matrix, k) pair. expand.grid() varies its first column fastest, so
# the rows come out grouped by k with the matrices in list order inside each k.
run_grid <- expand.grid(
  model_num = seq_along(gower_ls),
  k = begin_k:end_k
)

# Fit and summarise each pair with clus_func().
model_stats <- Map(
  function(m, k) clus_func(gower_ls[[m]], k),
  run_grid$model_num,
  run_grid$k
)

# rbindlist() belongs to data.table, which this script never loads; bind the
# one-row summaries with base R instead.
model_stats <- do.call(rbind, model_stats)
Related
I have five data frames with the same dimension (8 by 2) but with different column names as follows:
# Five example data frames (nbb, mbb, cbb, tmbb, tcbb). Each has eight rows
# and two columns: <name>_lb holds the values 2 to 9 and <name>_RMSE holds
# the RMSE recorded for each of them.
nbb <- data.frame(
nbb_lb = c(2, 3, 4, 5, 6, 7, 8, 9),
nbb_RMSE = c(1.0152338, 0.7199394, 0.7990978, 0.9045563, 1.6514406, 0.5160516, 0.4964024, 0.2617795)
)
mbb <- data.frame(
mbb_lb = c(2, 3, 4, 5, 6, 7, 8, 9),
mbb_RMSE = c(0.8324074, 0.9278236, 1.9817984, 0.9567368, 0.2814623, 0.1129459, 0.1233126, 0.4222578)
)
cbb <- data.frame(
cbb_lb = c(2, 3, 4, 5, 6, 7, 8, 9),
cbb_RMSE = c(1.27782499, 1.96332220, 0.74704997, 0.46579943, 1.10850563, 0.40456698, 0.26027359, 0.02452239)
)
tmbb <- data.frame(
tmbb_lb = c(2, 3, 4, 5, 6, 7, 8, 9),
tmbb_RMSE = c(0.83240742, 1.05126826, 0.08290467, 0.76397988, 1.23772208, 0.57628337, 0.56437185, 0.46460279)
)
tcbb <- data.frame(
tcbb_lb = c(2, 3, 4, 5, 6, 7, 8, 9),
tcbb_RMSE = c(0.9328510, 0.8312332, 0.9402116, 1.6029357, 2.0001519, 0.4387557, 0.5965582, 0.4148854)
)
I want to create a new data frame that will contain each row that contains minimum RMSE in the five(5) data frames above. If I make the row names to be the name of different data frames as (nbb, mbb, cbb, tmbb, tcbb)
rownames(df) <- c("nbb", "mbb", "cbb", "tmbb", "tcbb")
I desire to have a result like the below:
df     lb   RMSE
nbb    9    0.2617795
mbb    7    0.1129459
cbb    9    0.02452239
tmbb   4    0.08290467
tcbb   8    0.4387557
Here is a base R way.
First put the data.frames in a list with mget, then lapply over the list with an anonymous function that returns the minimum-RMSE row of each data frame. Bind the output rows and reorder them according to the wanted row names.
# Example input: five data frames, each with an *_lb column (values 2 to 9)
# and a matching *_RMSE column.
nbb <- data.frame(
  nbb_lb = c(2, 3, 4, 5, 6, 7, 8, 9),
  nbb_RMSE = c(1.0152338, 0.7199394, 0.7990978, 0.9045563, 1.6514406, 0.5160516, 0.4964024, 0.2617795)
)
mbb <- data.frame(
  mbb_lb = c(2, 3, 4, 5, 6, 7, 8, 9),
  mbb_RMSE = c(0.8324074, 0.9278236, 1.9817984, 0.9567368, 0.2814623, 0.1129459, 0.1233126, 0.4222578)
)
cbb <- data.frame(
  cbb_lb = c(2, 3, 4, 5, 6, 7, 8, 9),
  cbb_RMSE = c(1.27782499, 1.96332220, 0.74704997, 0.46579943, 1.10850563, 0.40456698, 0.26027359, 0.02452239)
)
tmbb <- data.frame(
  tmbb_lb = c(2, 3, 4, 5, 6, 7, 8, 9),
  tmbb_RMSE = c(0.83240742, 1.05126826, 0.08290467, 0.76397988, 1.23772208, 0.57628337, 0.56437185, 0.46460279)
)
tcbb <- data.frame(
  tcbb_lb = c(2, 3, 4, 5, 6, 7, 8, 9),
  tcbb_RMSE = c(0.9328510, 0.8312332, 0.9402116, 1.6029357, 2.0001519, 0.4387557, 0.5965582, 0.4148854)
)

# Fetch the data frames by their exact names, already in the wanted row
# order. An ls(pattern = "bb$") lookup would also pick up any unrelated
# object whose name ends in "bb" and would return them alphabetically.
wanted <- c("nbb", "mbb", "cbb", "tmbb", "tcbb")
df_list <- mget(wanted)

# For each data frame keep the row with the smallest value in column 2
# (RMSE); a data frame without a usable minimum contributes nothing.
min_rows <- lapply(df_list, function(x) {
  i <- which.min(x[[2]])
  if (length(i) > 0L) {
    data.frame(lb = x[i, 1], RMSE = x[i, 2])
  } else {
    NULL
  }
})
res <- do.call(rbind, min_rows)
# Label rows from the bound result itself so the labels stay aligned even
# when one of the inputs was dropped above.
res <- cbind.data.frame(df = rownames(res), res)
res
#> df lb RMSE
#> nbb nbb 9 0.26177950
#> mbb mbb 7 0.11294590
#> cbb cbb 9 0.02452239
#> tmbb tmbb 4 0.08290467
#> tcbb tcbb 9 0.41488540
Created on 2022-04-10 by the reprex package (v2.0.1)
First combine your 5 data frames into a list, then use lapply to go through all your data frames and output the necessary information. Also, wrap the lapply() with a do.call to row bind (rbind) the results. Finally, change the result into a data frame and convert the row names to a column.
You can skip rownames_to_column("df") if you want to have row names instead of a column storing the dataframe names.
library(tibble)
# Named list of the inputs; the names become the "df" labels below.
df_list <- list(nbb = nbb, mbb = mbb, cbb = cbb, tmbb = tmbb, tcbb = tcbb)
# Per data frame: the first-column value at the minimum of column 2, plus
# that minimum. The rows are bound and the row names moved into column "df".
lapply(df_list, function(x) {
  rmse <- x[, 2]
  data.frame(lb = x[which.min(rmse), 1], RMSE = min(rmse))
}) %>%
  do.call(what = rbind) %>%
  rownames_to_column("df")
df lb RMSE
1 nbb 9 0.26177950
2 mbb 7 0.11294590
3 cbb 9 0.02452239
4 tmbb 4 0.08290467
5 tcbb 9 0.41488540
Input data
Moreover, note that the way you define dataframe in your post is not correct, I've included the "correct" way here for reference.
# Input data: five data frames (nbb, mbb, cbb, tmbb, tcbb), each pairing an
# *_lb column holding 2 to 9 with the RMSE recorded for each value.
nbb <- data.frame(
nbb_lb = c(2, 3, 4, 5, 6, 7, 8, 9),
nbb_RMSE = c(1.0152338, 0.7199394, 0.7990978, 0.9045563, 1.6514406, 0.5160516, 0.4964024, 0.2617795))
mbb <- data.frame( mbb_lb = c(2, 3, 4, 5, 6, 7, 8, 9),
mbb_RMSE = c(0.8324074, 0.9278236, 1.9817984, 0.9567368, 0.2814623, 0.1129459, 0.1233126, 0.4222578)
)
cbb <- data.frame(
cbb_lb = c(2, 3, 4, 5, 6, 7, 8, 9),
cbb_RMSE = c(1.27782499, 1.96332220, 0.74704997, 0.46579943, 1.10850563, 0.40456698, 0.26027359, 0.02452239)
)
tmbb <- data.frame(
tmbb_lb = c(2, 3, 4, 5, 6, 7, 8, 9),
tmbb_RMSE = c(0.83240742, 1.05126826, 0.08290467, 0.76397988, 1.23772208, 0.57628337, 0.56437185, 0.46460279))
tcbb <- data.frame(
tcbb_lb = c(2, 3, 4, 5, 6, 7, 8, 9),
tcbb_RMSE = c(0.9328510, 0.8312332, 0.9402116, 1.6029357, 2.0001519, 0.4387557, 0.5965582, 0.4148854)
)
If you are open to a purrr approach, you could use
library(purrr)
library(dplyr)
# For each data frame in my_list keep the row(s) whose *_RMSE value equals
# the column minimum, strip everything up to the last "_" from the column
# names, and stack the results with the list names stored in a "df" column.
# map_dfr() already row-binds its results, so no trailing bind_rows() is
# needed; explicit functions replace the nested `~ .x` lambdas, whose `.x`
# meant a different thing at each nesting level.
my_list %>%
  map_dfr(
    function(d) {
      d %>%
        filter(if_any(ends_with("_RMSE"), function(v) v == min(v))) %>%
        rename_with(function(nm) gsub(".*_", "", nm))
    },
    .id = "df"
  )
this returns
df lb RMSE
1 cbb 9 0.02452239
2 mbb 7 0.11294590
3 nbb 9 0.26177950
4 tcbb 9 0.41488540
5 tmbb 4 0.08290467
with (borrowed from Rui Barradas)
# Collect, as a named list, every object in the current environment whose
# name ends in "bb".
my_list <- mget(ls(pattern = "bb$"))
I've got a batch of survey data that I'd like to be able to subset on a few specific columns which have 0-10 scale data (e.g. Rank your attitude towards x as 0 to 10) so that I can plot using ggplot() + facet_grid. Faceting will use 3 hi/med/low bins, with the cut-offs at one standard deviation above and one standard deviation below the mean. I have working code, which splits the overall dataframe into 3 parts like so:
# Generate sample data:
structure(list(Q4 = c(2, 3, 3, 5, 4, 3), Q5 = c(1, 3, 3, 3, 2,
2), Q6 = c(4, 3, 3, 3, 4, 4), Q7 = c(4, 2, 3, 5, 5, 5), Q53_1 = c(5,
8, 4, 5, 4, 5)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
# Acquire Q53_1 data as a factor on the 0-10 scale
political_scale <- factor(climate_experience_data$Q53_1, levels = 0:10)
# Recover the numeric scores from the level labels. as.numeric() applied
# directly to a factor returns the level codes (1-11 here), which would shift
# both thresholds up by one relative to the raw Q53_1 values filtered below.
political_score <- as.numeric(as.character(political_scale))
# Generate thresholds one standard deviation below / above the mean.
# na.rm = TRUE is given to mean() and sd(), which are the functions that
# honour it; the original passed it to as.numeric() instead.
score_mean <- mean(political_score, na.rm = TRUE)
score_sd <- sd(political_score, na.rm = TRUE)
low_threshold <- round(score_mean - score_sd, digits = 0)
high_threshold <- round(score_mean + score_sd, digits = 0)
# Generate low/med/high bins based on Mean and SD
political_lr_low <- filter(climate_experience_data, Q53_1 <= low_threshold)
political_lr_mid <- filter(climate_experience_data, Q53_1 < high_threshold & Q53_1 > low_threshold)
political_lr_high <- filter(climate_experience_data, Q53_1 >= high_threshold)
What I've realised is that this approach really doesn't lend itself to faceting. What I suspect is that I need to use a combination of mutate() across() where() and group_by() to add data to a new column Q53_scale with "hi" "med" "low" based on where Q53_1 values fall in relation to those low/high thresholds (e.g. SD +1 over mean and -1 under mean). My first few dozen attempts have fallen short - has anyone managed to use sd() to bin data for faceting in this way?
library(tidyverse)
# Sample survey data: six respondents, questions Q4-Q7 plus the Q53_1 scale.
climate_experience_data <- structure(
  list(
    Q4 = c(2, 3, 3, 5, 4, 3),
    Q5 = c(1, 3, 3, 3, 2, 2),
    Q6 = c(4, 3, 3, 3, 4, 4),
    Q7 = c(4, 2, 3, 5, 5, 5),
    Q53_1 = c(5, 8, 4, 5, 4, 5)
  ),
  row.names = c(NA, -6L),
  class = c("tbl_df", "tbl", "data.frame")
)

# Label each row "low" / "medium" / "high" depending on whether Q53_1 lies
# more than one standard deviation below the mean, within it, or above it,
# then draw Q4 against Q5 in one panel per label.
climate_experience_data %>%
  mutate(
    bin = factor(
      case_when(
        Q53_1 < mean(Q53_1) - sd(Q53_1) ~ "low",
        Q53_1 > mean(Q53_1) + sd(Q53_1) ~ "high",
        TRUE ~ "medium"
      ),
      levels = c("low", "medium", "high")
    )
  ) %>%
  ggplot(aes(x = Q4, y = Q5)) +
  geom_point() +
  facet_grid(cols = vars(bin))
Created on 2022-03-10 by the reprex package (v2.0.0)
Salut folks! I'm still quite new to ggplot and trying to understand it, but I really need some help here.
Edit: Reproducible Data of my Dataset "Daten_ohne_Cluster_NA", first 25 rows
structure(list(ntaxa = c(2, 2, 2, 2, 2, 2, 2, 5, 5, 5, 5, 5,
6, 6, 6, 6, 6, 5, 8, 8, 7, 7, 6, 5, 5), mpd.obs.z = c(-1.779004391,
-1.721014957, -1.77727283, -1.774642404, -1.789386039, -1.983401439,
-0.875426386, -2.276052068, -2.340365105, -2.203126078, -2.394158227,
-2.278173635, -1.269075471, -1.176760985, -1.313045215, -1.164289676,
-1.247549961, -0.868174033, -2.057106804, -2.03154772, -1.691850922,
-1.224391713, -0.93993654, -0.39315089, -0.418380361), mntd.obs.z = c(-1.759874454,
-1.855202792, -1.866281778, -1.798439855, -1.739998395, -1.890847575,
-0.920672112, -1.381541177, -1.382847758, -1.394870597, -1.339878669,
-1.349541665, -0.516793786, -0.525476292, -0.557425575, -0.539534996,
-0.521299478, -0.638951825, -1.06467985, -1.033009266, -0.758380203,
-0.572401837, -0.166616844, 0.399510209, 0.314591018), pe = c(0.046370234,
0.046370234, 0.046370234, 0.046370234, 0.046370234, 0.046370234,
0.071665745, 0.118619482, 0.118619482, 0.118619482, 0.118619482,
0.118619482, 0.205838414, 0.205838414, 0.205838414, 0.205838414,
0.205838414, 0.179091659, 0.215719118, 0.215719118, 0.212092271,
0.315391478, 0.312205596, 0.305510773, 0.305510773), ECO_NUM = c(1,
6, 6, 1, 7, 6, 6, 6, 6, 6, 6, 7, 7, 6, 1, 6, 6, 6, 6, 6, 6, 7,
7, 7, 6)), row.names = c(NA, -25L), class = c("tbl_df", "tbl",
"data.frame"))
(1) I prepared my Dataframe like this:
# Keep only the columns used for clustering. (The stray leading quote in the
# original line opened a string that was never closed.)
Daten_Cluster <- Daten[, c("ntaxa", "mpd.obs.z", "mntd.obs.z", "pe", "ECO_NUM")]
(2) I threw out all the NA's with na.omit. It is 6 variables with 3811 objects each. The column ECO_NUM represents the different ecoregions as a categorical, numerically coded factor.
(3) Then I did a cluster analysis with k-means. I used 31 groups as there are 31 ecoregions in my dataset and the aim is to colour the plot by ecoregion later on.
# k-means with 31 centres on the NA-free data. (The stray leading quote in
# the original line opened a string that was never closed.)
Biomes_Clus <- kmeans(Daten_Cluster_ohne_NA, centers = 31, iter.max = 10, nstart = 25)
(4) Then I followed the online instructions from datanovia.com on how to visualise a k-means cluster analysis (I always just follow these How-Tos as I have no idea how to do it all by myself). I tried to change the arguments accordingly to colour by ecoregion.
# Draw the k-means result Biomes_Clus over Daten_Cluster_ohne_NA as points
# with ellipse.type = "convex" and a black-and-white theme, then add a
# stat_mean() layer coloured by the ECO_NUM column.
# NOTE(review): the trailing comma after `ggtheme = theme_bw()` leaves an
# empty argument in the call - confirm this is intended.
# NOTE(review): ECO_NUM is numeric in the posted sample, and it is passed
# through `$` inside aes() - presumably it should be a factor for discrete
# colours; verify.
fviz_cluster(Biomes_Clus, data = Daten_Cluster_ohne_NA,
geom = "point",
ellipse.type = "convex",
ggtheme = theme_bw(),
) +
stat_mean(aes(color = Daten_Cluster_ohne_NA$ECO_NUM), size = 4)
I get more than 50 warnings here, I guess for each object. Saying: In grid.Call.graphics(C_points, x$x, x$y, x$pch, x$size) : unimplemented pch value '30'
I know that there are not enough pch-symbols for 31 groups, but I also don't need them - I just would like to have it coloured.
I also tried out the other function ggscatter and created my own color-palette (called P36) with more than 31 colours to have enough colours for the ecoregions.
# Scatter plot of ind.coord on Dim.1 / Dim.2 with convex ellipses, axis
# labels built from variance.percent, and a stat_mean() layer coloured by
# `cluster`.
# NOTE(review): palette = "P36" passes the string "P36", not an object named
# P36 - presumably the custom palette vector itself was meant; verify.
# NOTE(review): color = "Species" refers to a column that does not appear in
# the posted sample data - confirm which column ind.coord actually has.
ggscatter(
ind.coord, x = "Dim.1", y = "Dim.2",
color = "Species", palette = "P36", ellipse = TRUE, ellipse.type = "convex",
legend = "right", ggtheme = theme_bw(),
xlab = paste0("Dim 1 (", variance.percent[1], "% )" ),
ylab = paste0("Dim 2 (", variance.percent[2], "% )" )
) +
stat_mean(aes(color = cluster), size = 4)
The error here is that a discrete value was supplied to a continuous scale. The question is: how can I easily colour the outcome of my k-means (which worked), not by the newly clustered groups but by the ecoregions (to visualise whether there is a difference between the clusters and the ecoregion groups)?
I appreciate your help and me and my group partner would be very thankful!! :)
Greetings
Evelyn
Keep in mind, I am very new to R.
I have a dataset from a public opinion survey, and would like to represent the answers through a bubble chart, though the data is categorical, not numeric.
From dataset "Arab4" I have question/variable "Q713" with all of the observations coded as 1, 2, 3, 4, or 5 as the response options. I would like to plot the bubbles (stacked on top of one another by "country") with the size of the bubble corresponding to the percent of the vote share that answer got. For example, if 49% of respondents in Israel voted for option 1 under question "Q", then the bubble size would represent 49% and be situated above the Israel category label with the color of the bubble corresponding to the response type (1, 2, 3, 4, or 5).
I have the following code, giving me a blank chart, and I know to eventually use the "points" command with more specifications.
What I need help with is defining the radius of the circles from the data I have.
# Set up an empty plotting frame (type = "n") for country against the q713
# answers, then add the observations as points.
# Column names are written in lower case in both calls so they agree with
# each other and with the dput(Arab4$q713...) call; R names are case-sensitive
# and `$` returns NULL for a misspelt column - TODO confirm against
# names(Arab4).
# xlab / ylab are set to "" because FALSE would be printed as the label text.
plot(Arab4$country, Arab4$q713, type = "n", xlab = "", ylab = "")
points(Arab4$country, Arab4$q713)
Here is some dput from the data set
dput(Arab4$q713[1:50])
structure(c(3, 5, 3, 3, 1, 3, 5, 5, 5, 5, 3, 2, 2, 3, 1, 1, 4,
2, 3, 5, 5, 5, 2, 5, 4, 2, 5, 2, 5, 3, 5, 5, 2, 2, 5, 2, 1, 2,
1, 2, 5, 3, 4, 5, 1, 1, 1, 4, 5, 3), labels = structure(c(1,
2, 3, 4, 5, 98, 99), .Names = c("Promoting democracy", "Promoting economic
development",
"Resolving the Arab-Israeli conflict", "Promoting women’s rights",
"The US should not get involved", "Don't know (Do not read)",
"Decline to answer (Do not read)")), class = "labelled")
Any ideas would help! Thanks!
As others have commented, this really is not a bubble chart as you only have 2 dimensions and the size of the circle does not add anything (other than perhaps visual appeal). But with that disclaimer, here is one approach to what I think you are trying to achieve. This requires the ggplot2 and reshape2 libraries.
library(ggplot2)
library(reshape2)
# create simulated data: 20 answers on a 1-5 scale for each country
# (replace = TRUE is required: sample() cannot draw 20 values from only 5
# candidates without replacement)
dat <- data.frame(
  Egypt = sample(1:5, 20, replace = TRUE),
  Libya = sample(1:5, 20, replace = TRUE)
)
# tabulate: one Response / Count table per country. Building each table as a
# data frame keeps the column meaning fixed, whereas apply(dat, 2, table)
# returns a matrix or a list depending on which answers happen to occur.
dat.tab <- lapply(dat, function(x) {
  as.data.frame(table(Response = x), responseName = "Count")
})
# stack the per-country tables, labelling every row with its country
dat.long <- do.call(
  rbind,
  Map(
    function(tab, country) cbind(tab, Country = country),
    dat.tab,
    names(dat.tab)
  )
)
rownames(dat.long) <- NULL
ggplot(dat.long, aes(x = Country, y = Count, color = Country)) +
  geom_point(aes(size = Count))
EDIT Here is another approach, using the data manipulation tools of the dplyr package to get you all the way to proportions:
# using dat from above again
library(dplyr) # supplies %>%, group_by(), count() and mutate() used below
# long format: one row per individual answer, labelled with its country
dat.long <- melt(dat)
colnames(dat.long) <- c("Country", "Response")
# count each response within a country and turn the counts into proportions
# of that country's answers; ungroup() so later steps are not silently grouped
dat.tab <- dat.long %>%
  group_by(Country) %>%
  count(Response) %>%
  mutate(prop = prop.table(n)) %>%
  ungroup()
ggplot(dat.tab, aes(x = Country, y = prop, color = Country)) +
  geom_point(aes(size = prop))
You will need to do a little additional work to remove unwanted values (98, 99) if they are truly unwanted.
hth.
Total R-newbie, here. Please be gentle.
I have a column in a dataframe with numerical values representing ethnicity (UK Census data).
# create example data: nine ids (1-9) paired with ethnicity codes 0-8
id <- as.numeric(1:9)
ethnicode <- as.numeric(0:8)
df <- data.frame(id, ethnicode)
I can do a mapping (or find/replace) to create a column (or edit an existing column) that contains a human-readable value:
# Translate each numeric code into its label. The from/to pairs are the same
# as before, just listed in ascending code order.
df$ethnicity <- mapvalues(
  df$ethnicode,
  from = c(0, 1, 2, 3, 4, 5, 6, 7, 8),
  to = c(
    "All", "WhiteTotal", "WhiteUK", "WhiteIrish", "WhiteOther",
    "Mixed", "Asian", "Black", "Other"
  )
)
Of all of the things I tried this seemed to be the quickest (around 20 seconds for 9 million rows as opposed to over a minute with some approaches).
What I can’t seem to find (or understand from what I’ve read), is how to reference a lookup table instead.
# create lookup table: one row per code with its human-readable label
# (the original `c((` had an unmatched parenthesis and did not parse)
ethnicode <- c(8, 7, 6, 5, 4, 3, 2, 1, 0)
ethnicity <- c(
  "Other", "Black", "Asian", "Mixed", "WhiteOther",
  "WhiteIrish", "WhiteUK", "WhiteTotal", "All"
)
lookup <- data.frame(ethnicode, ethnicity)
The point being, if I want to change the human readable strings, or do anything else to the process, I’d rather do it once to the look-up table, than have to do it in several places in several scripts... and if I can do it more efficiently (under 20 seconds for 9 million rows) that would be good, too.
I also want to easily make sure that “8” still equals ‘Other’ (or whatever equivalent), and “0” still equals ‘All’, etc., which is more difficult, visually, with longer lists using the above approach.
Thanks in advance.
You could use named vectors for this. However, you would need to convert the ethnicode to character.
# Example data with the codes stored as character, so they can index a named
# vector by name.
df <- data.frame(
  id = as.numeric(1:9),
  ethnicode = as.character(0:8),
  stringsAsFactors = FALSE
)
# create lookup table: a character vector of labels named by code
ethnicode <- c(8, 7, 6, 5, 4, 3, 2, 1, 0)
ethnicity <- c(
  "Other", "Black", "Asian", "Mixed", "WhiteOther",
  "WhiteIrish", "WhiteUK", "WhiteTotal", "All"
)
lookup <- ethnicity
names(lookup) <- as.character(ethnicode)
Then you can do
# Index the named lookup vector with the character codes in df's ethnicode
# column and store the resulting labels as a new ethnicity column.
df <- transform(df, ethnicity=lookup[ethnicode], stringsAsFactors=FALSE)
and you are done.
For working with 9 million rows, I suggest you use a database like sqlite or monetdb. For sqlite, the following code might be helpful:
library(RSQLite)
dbname <- "big_data_mapping.db" # db to create
csvname <- "data/big_data_mapping.csv" # large dataset
ethn_codes <- data.frame(
  ethnicode = c(8, 7, 6, 5, 4, 3, 2, 1, 0),
  ethnicity = c("Other", "Black", "Asian", "Mixed", "WhiteOther", "WhiteIrish", "WhiteUK", "WhiteTotal", "All")
)
# build db
con <- dbConnect(SQLite(), dbname)
# Everything that uses the connection runs inside tryCatch(..., finally = )
# so the connection is closed even when an import or the query fails.
dat <- tryCatch(
  {
    dbWriteTable(con, name = "main", value = csvname, overwrite = TRUE)
    dbWriteTable(con, name = "ethn_codes", value = ethn_codes, overwrite = TRUE)
    # join the tables
    dbGetQuery(con, "SELECT main.id, ethn_codes.ethnicity FROM main JOIN ethn_codes ON main.ethnicode=ethn_codes.ethnicode")
  },
  # finish
  finally = dbDisconnect(con)
)
#file.remove(dbname)
monetdb is said to be more suitable for the tasks you usually do with R, so it is definitely worth a look.