<- RSQLite::dbConnect(drv = RSQLite::SQLite(),"data/mwbugs.gpkg")
db_m source(paste0("https://tools.thewerg.unimelb.edu.au/data/mwbugs/",
"bug_database_functions.R"))
8 Examples of data usage
This chapter provides two examples of appraoches to using the database in R for common applications:
compilation of assemblage data from a subset of samples into taxon-by-sample matrix, a common data format for statistical analysis of assemblage composition;
extraction and mapping of occurrence data for a single taxon.
As described earlier, a simple approach to begin any analysis is to create a connection to the database and read all tables into the R environment.
Once you have access to the database, you need to create a connection to it to read its tables. Here is an example of creating a connection to the gpkg version of the database in R, having downloaded it to a subfolder ‘data’ of the working directory.
load_all_mwbugs_tables(db_m)
# Always remember to disconnect from database once data is accessed
::dbDisconnect(db_m) DBI
8.1 Example 1: compile a taxon-by-sample table
The following code and comments demonstrate the process of selecting a subset of the macroinvertebrate data using a range of criteria, lumping the taxonomic resolution to a uniform level (family) and producing a family-by-sample matrix (Table tbl-fam_by_samp) for analysis.
# Having inspected the sample provenance table [command View(spv)], I
# selected the 2021-2022 MW biological monitoring program: sourcecode 62
<- samples[samples$sourcecode == 62,] # 72 samples
samples_62 <- sites[sites$sitecode %in% samples_62$sitecode,] # sites sites_62
The sites table has mw_subcat and mw_cat fields that refer to Melbourne Water’s management units (which they call subcatchments and catchments). To access the key to the codes used in those fields we first need to access the mw_catchment table from the stream network database. Here I create a connection to the stream network database saved as a gpkg file in the same directory as the mwbugs database.
<- RSQLite::dbConnect(drv = RSQLite::SQLite(),"data/mwstr_v13.gpkg") db_str
<- sf::st_read(db_str, "mw_subcatchments")
mw_subcatchments # This is a spatial table, thus use st_read rather than dbReadTable
# Having inspected the mw_subcatchments [command View(mw_subcatchments)],
# I selected samples from the "Emu Creek" mw subcatchment
<- sites_62[sites_62$mw_subcat == 29,] # 4 sites
sites_62y <- samples_62[samples_62$sitecode %in%
samples_62y $sitecode,] # 8 samples
sites_62y# Extract the biological data from the 8 chosen samples
<- biota[biota$smpcode %in% samples_62y$smpcode,]
biota_62y # To check if all records are family level or higher
# sum(nchar(biota_62y$shortcode) > 4) # 3 shortcodes with >4 characters
# biota_62y[nchar(biota_62y$shortcode) > 4,] # they are records of the
# # ambiguous taxon Phreatoicidea (taxoncode OR999998) as explained in
# Best to conservatively lump this to "OR", which is done below
# If there were any other records at lower resolution than family, they
# could also be lumped to family by creating a famcode as:
$famcode <- biota_62y$taxoncode
biota_62y$famcode[nchar(biota_62y$famcode) > 4] <-
biota_62ysubstr(biota_62y$famcode,1,4)
$famcode[substr(biota_62y$famcode,3,4) == "99"] <-
biota_62ysubstr(biota_62y$famcode,1,2)[substr(biota_62y$famcode,3,4) == "99"]
# Create a family-by-sample matrix with count values
<- ct(rows = biota_62y$famcode, cols = biota_62y$smpcode,
fam_smp_count values = biota_62y$count)
# Make first column taxon names (Use taxon_all to look up famcodes, as
# the [as-supplied] taxon values in the biota table may be unreliable)
<- taxon_all$taxon[match(row.names(fam_smp_count),taxon_all$shortcode)]
ct_taxa <- cbind(data.frame(taxon = ct_taxa), fam_smp_count)
fam_smp_count ::flextable(fam_smp_count[c(1:3,8,17,61,62),]) flextable
taxon | 381-CHA-1268-2-1-AC | 381-CHA-1268-2-1-BC | 381-EMU-14206-0-1-AC | 381-EMU-14206-0-2-AC | 381-MAM-1366-8-1-AC | 381-MAM-1366-8-2-AC | 382-BOL-1267-8-1-AC | 382-BOL-1267-8-1-BC |
---|---|---|---|---|---|---|---|---|
Hydridae | 0 | 0.0 | 1 | 0 | 0.0 | 0 | 0 | 0 |
Temnocephalidae | 0 | 2.0 | 0 | 0 | 0.0 | 0 | 0 | 0 |
Dugesiidae | 0 | 0.0 | 6 | 0 | 0.0 | 0 | 1 | 1 |
Physidae | 21 | 2.0 | 28 | 20 | 0.0 | 0 | 0 | 0 |
Parastacidae | 0 | 0.1 | 0 | 0 | 0.1 | 1 | 0 | 0 |
Calocidae | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 3 | 18 |
Leptoceridae | 1 | 0.0 | 42 | 27 | 5.0 | 4 | 3 | 0 |
The count data in the table only makes sense accounting for subsample size for these lab-sorted sub-sampled samples. To create an equivalent family-by-sample matrix (Table tbl-fam_by_samp_ss) with subsample proportion (not %) values:
$ss <- samples_62y$subsample_perc[match(biota_62y$smpcode,
biota_62y$smpcode)]*0.01
samples_62y# But note that coarsepick specimens are from whole samples,
# so ss should be 1 for them
$ss[biota_62y$coarsepick == 1] <- 1
biota_62y# And then make a matching family-by-sample matrix for ss
<- ct(rows = biota_62y$famcode, cols = biota_62y$smpcode,
fam_smp_ss values = biota_62y$ss)
<- cbind(data.frame(taxon = ct_taxa), fam_smp_ss)
fam_smp_ss ::flextable(fam_smp_ss[c(1:3,8,17,61,62),]) flextable
taxon | 381-CHA-1268-2-1-AC | 381-CHA-1268-2-1-BC | 381-EMU-14206-0-1-AC | 381-EMU-14206-0-2-AC | 381-MAM-1366-8-1-AC | 381-MAM-1366-8-2-AC | 382-BOL-1267-8-1-AC | 382-BOL-1267-8-1-BC |
---|---|---|---|---|---|---|---|---|
Hydridae | 0.00 | 0.0 | 0.1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
Temnocephalidae | 0.00 | 0.1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
Dugesiidae | 0.00 | 0.0 | 0.1 | 0.0 | 0.0 | 0.0 | 0.1 | 0.1 |
Physidae | 0.15 | 0.1 | 0.1 | 0.1 | 0.0 | 0.0 | 0.0 | 0.0 |
Parastacidae | 0.00 | 1.0 | 0.0 | 0.0 | 1.0 | 0.1 | 0.0 | 0.0 |
Calocidae | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.1 | 0.1 |
Leptoceridae | 0.15 | 0.0 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | 0.0 |
8.2 Example 2: A map of taxon occurrences
The following code extracts all presences and absences of a family of interest (Paramelitidae) and produce a map of their proportional occurrences, accounting for subsampling effort (Figure fig-op06_map).
# Find famcode for Paramelitidae
$taxon == "Paramelitidae",] #OP06 taxon_all[taxon_all
shortcode taxon table
1371 OP06 Paramelitidae taxon_fam
# Restrict search to lab-sorted rapid-bioassessment samples
<- samples[grepl("RBA",samples$collection_method) &
samps_rbaf grepl("lab",samples$processing_method),] #3925 samples
# Find samps_rbaf that contain a record of OP06
<- unique(biota$smpcode[biota$smpcode %in%
samps_rbaf_p $smpcode & substr(biota$taxoncode,1,4) == "OP06"])
samps_rbaf$OP06 <- 0
samps_rbaf$OP06[samps_rbaf$smpcode %in% samps_rbaf_p] <- 1
samps_rbaf# Estimate effective number of samples per site given subsampling effort
$ss <- samps_rbaf$subsample_perc/100
samps_rbaf$OP06 <- samps_rbaf$OP06* samps_rbaf$ss
samps_rbaf# Aggregate effective no. samples and effective no. occurrences by site
<- aggregate(samps_rbaf[c("ss","OP06")],
op06_by_site by = list(sitecode = samps_rbaf$sitecode),
FUN = sum, na.rm = TRUE) # NAs samples with no biota data (yet)
# Occurrence as a proportion of sampling effort
$op06_ppn <- op06_by_site$OP06/op06_by_site$ss
op06_by_site<- sites[sites$sitecode %in% op06_by_site$sitecode,]
sites_rbaf # merge with sites table and plot on the stream network (see above)
<- merge(op06_by_site,sites)
x <- colorRampPalette(c("#fff7ec", "#7f0000"))
pal $col <-pal(100)[match(round(x$op06_ppn*100),1:100)]
x
<- sf::st_read(db_str,
streams_sampleable query = "SELECT * FROM streams WHERE sampleable = 1;")
<- sf::st_read(db_str, "coast")
coast ::dbDisconnect(db_str)
DBIpar(mar = c(0,0,0,0))
plot(streams_sampleable$geom, col = "lightblue")
plot(coast$geom, add = TRUE)
plot(x$geom, cex = x$ss^0.25, pch = 16, col = x$col, add= TRUE)
legend("bottomleft",pch = 15, pt.cex = 2.75, col = pal(6),
legend = seq(0,1,0.2), title = "Occurrence")