8  Examples of data usage

This chapter provides two examples of appraoches to using the database in R for common applications:

  1. compilation of assemblage data from a subset of samples into taxon-by-sample matrix, a common data format for statistical analysis of assemblage composition;

  2. extraction and mapping of occurrence data for a single taxon.

As described earlier, a simple approach to begin any analysis is to create a connection to the database and read all tables into the R environment.

Once you have access to the database, you need to create a connection to it to read its tables. Here is an example of creating a connection to the gpkg version of the database in R, having downloaded it to a subfolder ‘data’ of the working directory.

db_m <- RSQLite::dbConnect(drv = RSQLite::SQLite(),"data/mwbugs.gpkg")
source(paste0("https://tools.thewerg.unimelb.edu.au/data/mwbugs/",
              "bug_database_functions.R"))
load_all_mwbugs_tables(db_m)
# Always remember to disconnect from database once data is accessed
DBI::dbDisconnect(db_m)

8.1 Example 1: compile a taxon-by-sample table

The following code and comments demonstrate the process of selecting a subset of the macroinvertebrate data using a range of criteria, lumping the taxonomic resolution to a uniform level (family) and producing a family-by-sample matrix (Table 8.1) for analysis.

# Having inspected the sample provenance table [command View(spv)], I 
# selected the 2021-2022 MW biological monitoring program: sourcecode 62
samples_62 <- samples[samples$sourcecode == 62,]  # 72 samples
sites_62 <- sites[sites$sitecode %in% samples_62$sitecode,] # sites

The sites table has mw_subcat and mw_cat fields that refer to Melbourne Water’s management units (which they call subcatchments and catchments). To access the key to the codes used in those fields we first need to access the mw_catchment table from the stream network database. Here I create a connection to the stream network database saved as a gpkg file in the same directory as the mwbugs database.

db_str <- RSQLite::dbConnect(drv = RSQLite::SQLite(),"data/mwstr_v13.gpkg")
mw_subcatchments <- sf::st_read(db_str, "mw_subcatchments") 
# This is a spatial table, thus use st_read rather than dbReadTable
# Having inspected the mw_subcatchments [command View(mw_subcatchments)],  
# I selected samples from the "Emu Creek" mw subcatchment
sites_62y <- sites_62[sites_62$mw_subcat == 29,] # 4 sites
samples_62y <- samples_62[samples_62$sitecode %in% 
                            sites_62y$sitecode,] # 8 samples
# Extract the biological data from the 8 chosen samples
biota_62y <- biota[biota$smpcode %in% samples_62y$smpcode,]
# To check if all records are family level or higher 
# sum(nchar(biota_62y$shortcode) > 4) # 3 shortcodes with >4 characters
# biota_62y[nchar(biota_62y$shortcode) > 4,] # they are records of the 
# # ambiguous taxon Phreatoicidea (taxoncode OR999998) as explained in 
# Best to conservatively lump this to "OR", which is done below
# If there were any other records at lower resolution than family, they 
# could also be lumped to family by creating a famcode as:
biota_62y$famcode <- biota_62y$taxoncode
biota_62y$famcode[nchar(biota_62y$famcode) > 4] <- 
  substr(biota_62y$famcode,1,4)
biota_62y$famcode[substr(biota_62y$famcode,3,4) == "99"] <- 
  substr(biota_62y$famcode,1,2)[substr(biota_62y$famcode,3,4) == "99"]

# Create a family-by-sample matrix with count values 
fam_smp_count <- ct(rows = biota_62y$famcode, cols = biota_62y$smpcode, 
                    values = biota_62y$count)
# Make first column taxon names (Use taxon_all to look up famcodes, as 
# the [as-supplied] taxon values in the biota table may be unreliable)
ct_taxa <- taxon_all$taxon[match(row.names(fam_smp_count),taxon_all$shortcode)]
fam_smp_count <- cbind(data.frame(taxon = ct_taxa), fam_smp_count)
flextable::flextable(fam_smp_count[c(1:3,8,17,61,62),])
Table 8.1: Seven selected rows of the family-by-sample table of counts produced by the example code.

taxon

381-CHA-1268-2-1-AC

381-CHA-1268-2-1-BC

381-EMU-14206-0-1-AC

381-EMU-14206-0-2-AC

381-MAM-1366-8-1-AC

381-MAM-1366-8-2-AC

382-BOL-1267-8-1-AC

382-BOL-1267-8-1-BC

Hydridae

0

0.0

1

0

0.0

0

0

0

Temnocephalidae

0

2.0

0

0

0.0

0

0

0

Dugesiidae

0

0.0

6

0

0.0

0

1

1

Physidae

21

2.0

28

20

0.0

0

0

0

Parastacidae

0

0.1

0

0

0.1

1

0

0

Calocidae

0

0.0

0

0

0.0

0

3

18

Leptoceridae

1

0.0

42

27

5.0

4

3

0

The count data in the table only makes sense accounting for subsample size for these lab-sorted sub-sampled samples. To create an equivalent family-by-sample matrix (Table 8.2) with subsample proportion (not %) values:

biota_62y$ss <- samples_62y$subsample_perc[match(biota_62y$smpcode, 
                                               samples_62y$smpcode)]*0.01
# But note that coarsepick specimens are from whole samples, 
# so ss should be 1 for them
biota_62y$ss[biota_62y$coarsepick == 1] <- 1
# And then make a matching family-by-sample matrix for ss
fam_smp_ss <- ct(rows = biota_62y$famcode, cols = biota_62y$smpcode, 
                    values = biota_62y$ss)
fam_smp_ss <- cbind(data.frame(taxon = ct_taxa), fam_smp_ss)
flextable::flextable(fam_smp_ss[c(1:3,8,17,61,62),])
Table 8.2: Seven selected rows of the family-by-sample subsample proportion (ss) table, with coarse-pick specimens given ss = 1.

taxon

381-CHA-1268-2-1-AC

381-CHA-1268-2-1-BC

381-EMU-14206-0-1-AC

381-EMU-14206-0-2-AC

381-MAM-1366-8-1-AC

381-MAM-1366-8-2-AC

382-BOL-1267-8-1-AC

382-BOL-1267-8-1-BC

Hydridae

0.00

0.0

0.1

0.0

0.0

0.0

0.0

0.0

Temnocephalidae

0.00

0.1

0.0

0.0

0.0

0.0

0.0

0.0

Dugesiidae

0.00

0.0

0.1

0.0

0.0

0.0

0.1

0.1

Physidae

0.15

0.1

0.1

0.1

0.0

0.0

0.0

0.0

Parastacidae

0.00

1.0

0.0

0.0

1.0

0.1

0.0

0.0

Calocidae

0.00

0.0

0.0

0.0

0.0

0.0

0.1

0.1

Leptoceridae

0.15

0.0

0.1

0.1

0.1

0.1

0.1

0.0

8.2 Example 2: A map of taxon occurrences

The following code extracts all presences and absences of a family of interest (Paramelitidae) and produce a map of their proportional occurrences, accounting for subsampling effort (Figure 8.1).

# Find famcode for Paramelitidae
taxon_all[taxon_all$taxon == "Paramelitidae",]  #OP06
     shortcode         taxon     table
1395      OP06 Paramelitidae taxon_fam
# Restrict search to lab-sorted rapid-bioassessment samples
samps_rbaf <- samples[grepl("RBA",samples$collection_method) & 
                  grepl("lab",samples$processing_method),] #3925 samples
# Find samps_rbaf that contain a record of OP06
samps_rbaf_p <- unique(biota$smpcode[biota$smpcode %in% 
          samps_rbaf$smpcode & substr(biota$taxoncode,1,4) ==  "OP06"])
samps_rbaf$OP06 <- 0
samps_rbaf$OP06[samps_rbaf$smpcode %in% samps_rbaf_p] <- 1
# Estimate effective number of samples per site given subsampling effort
samps_rbaf$ss <- samps_rbaf$subsample_perc/100
samps_rbaf$OP06 <- samps_rbaf$OP06* samps_rbaf$ss
# Aggregate effective no. samples and effective no. occurrences by site
op06_by_site <- aggregate(samps_rbaf[c("ss","OP06")],
                          by = list(sitecode = samps_rbaf$sitecode), 
        FUN = sum, na.rm = TRUE) # NAs samples with no biota data (yet)
# Occurrence as a proportion of sampling effort
op06_by_site$op06_ppn <- op06_by_site$OP06/op06_by_site$ss
sites_rbaf <- sites[sites$sitecode %in% op06_by_site$sitecode,]
# merge with sites table and plot on the stream network (see above)
x <- merge(op06_by_site,sites)
pal <- colorRampPalette(c("#fff7ec", "#7f0000"))
x$col <-pal(100)[match(round(x$op06_ppn*100),1:100)]

streams_sampleable <- sf::st_read(db_str, 
                query = "SELECT * FROM streams WHERE sampleable = 1;")
coast <- sf::st_read(db_str, "coast")
DBI::dbDisconnect(db_str)
par(mar = c(0,0,0,0))
plot(streams_sampleable$geom, col = "lightblue")
plot(coast$geom, add = TRUE)
plot(x$geom, cex = x$ss^0.25, pch = 16, col = x$col, add= TRUE)
legend("bottomleft",pch = 15, pt.cex = 2.75, col = pal(6), 
       legend = seq(0,1,0.2), title = "Occurrence")

Figure 8.1: Occurrences of Paramelitidae in all lab-sorted rapid bioassessment samples in the macroinvertebrate database. Point sizes represent the sampling effort at each site and the colors represent occurrence as a proportion of sampling effort.