library(here)
here() starts at /Users/ads7fg/git/data.USGS
library(XML)
library(xml2)
library(data.table)
library(stringr)
library(readr)
library(here)
here() starts at /Users/ads7fg/git/data.USGS
library(XML)
library(xml2)
library(data.table)
library(stringr)
library(readr)
download.file(
"https://www.sciencebase.gov/catalog/file/get/677eaf95d34e760b392c4970?f=__disk__70%2Fcf%2F36%2F70cf3695ad9405884df4a4758e4b609013e3fb1e",
here("data/ingest/usgs_mineral_2025.zip")
)unzip(here("data/ingest/usgs_mineral_2025.zip"), exdir = here("data/ingest/usgs_mineral_2025/"))
unlink(here("data/ingest/usgs_mineral_2025.zip"))
download.file(
"https://www.sciencebase.gov/catalog/file/get/65a6e45fd34e5af967a46749?f=__disk__9c%2F08%2Fa8%2F9c08a8bb0d04a87840f4f7f4ba489adf0d277f32",
here("data/ingest/usgs_mineral_2024.zip")
)unzip(here("data/ingest/usgs_mineral_2024.zip"), exdir = here("data/ingest/usgs_mineral_2024/"))
unlink(here("data/ingest/usgs_mineral_2024.zip"))
download.file(
"https://www.sciencebase.gov/catalog/file/get/63b5f411d34e92aad3caa57f?f=__disk__52%2F8b%2Ff6%2F528bf6c0a324c889837f857d5fe7acf8d5a9965c",
here("data/ingest/usgs_mineral_2023.zip")
)unzip(here("data/ingest/usgs_mineral_2023.zip"), exdir = here("data/ingest/usgs_mineral_2023/"))
unlink(here("data/ingest/usgs_mineral_2023.zip"))
download.file(
"https://www.sciencebase.gov/catalog/file/get/6197ccbed34eb622f692ee1c?f=__disk__ef%2Fa8%2F27%2Fefa827f9cad2012de291a711b2a073b0a7cd4aa5",
here("data/ingest/usgs_mineral_2022.zip")
)unzip(here("data/ingest/usgs_mineral_2022.zip"), exdir = here("data/ingest/usgs_mineral_2022/"))
unlink(here("data/ingest/usgs_mineral_2022.zip"))
<- list.files(here("data/ingest"), pattern = "*.xml", full.names = T, recursive = T) xml_files
<- data.table(year = character(), mineral = character(), unit = character(), filename = character()) mineral_files
for (f in 1:length(xml_files)) {
# EXTRACT YEAR FROM FILENAME
<- str_extract(xml_files[f], "[1-9][0-9][0-9][0-9]")
yr # ONLY PROCESS =< 2024
if (yr > 2024) break
# PROCESS XML TO R OBJECTS
<- read_file(xml_files[f])
xml_txt <- str_replace_all(xml_txt, "\\&", "\\&")
xml_txt <- read_xml(xml_txt)
xml
<- xml_find_all(xml, ".//attr")
attrs <- xml_find_all(xml, ".//enttyp")
ents <- xml_find_all(xml, ".//attrunit")
attrunit <- as_list(xml_find_all(xml, ".//title")[1])[[1]][[1]]
title
<- as_list(attrs)
attrs_list <- as_list(ents)
ents_list <- as_list(attrunit)
attrunit_list
# SET MINERAL NAME
for (i in 1:length(attrs_list)) {
<- attrs_list[[i]]
a if (a$attrlabl[[1]] == "Commodity") {
# print(a$attrdomv$edom$edomv[[1]])
<- toupper(a$attrdomv$edom$edomv[[1]])
mineral break
else if (str_detect(title, "Data Release")) {
} <- as_list(xml_find_all(xml, ".//title")[1])[[1]][[1]]
x <- substr(x, str_locate(x, "[A-Z][A-Z]+")[1], str_locate(x, " Data Release")[1] - 1)
mineral else {
} <- "mineral not found"
mineral
}
}# print(mineral)
# SET FILENAME
for (j in 1:length(ents_list)) {
<- ents_list[[j]]$enttypl[[1]]
e if (grepl("world", e, fixed = TRUE)) {
# print(e)
<- e
filename break
else {
} <- "filename not found"
filename
}
}# print(filename)
# SET UNIT
for (k in 1:length(attrunit_list)) {
<- trimws(attrunit_list[[k]][[1]][[1]][[1]])
u if (u != "" & !u %like% ",") {
<- toupper(u)
unit break
else {
} <- "unit not found"
unit
}
}# print(unit)
# IF BOTH MINERAL AND FILENAME ARE SET, ADD TO data.table
if (mineral != "not found" && filename != "filename not found") {
<- data.table(year = yr, mineral = mineral, unit = unit, filename = filename)
dt <- rbindlist(list(mineral_files, dt))
mineral_files
} }
fwrite(mineral_files, here("data/working/mineral_file_names_before_2025.csv"))
The data files are varied in their format. There was no followed standard for column number or column names. We convert all data files to long (tidy) format - one reading/measurement/comment per row. Doing so gives each dataset the same set of columns (Commodity, Country, Type, Meas_Unit, Meas_Name, and Value) so that they can be merged. The measure name (Meas_Name) is the former name of the data column. For example, if a data column was named “Cap_t_est_2020”, that name now becomes Meas_Name (and each data entry gets its own row).
<- data.table(
dt_lng Source = character(),
Commodity = character(),
Country = character(),
Type = character(),
Meas_Unit = character(),
Meas_Name = character(),
Value = character()
)
<- fread(here("data/working/mineral_file_names_before_2025.csv")) file_vars
# FOR EACH ROW IN file_vars
for (i in 1:nrow(file_vars)) {
<- file_vars[i]$filename
fname # print(fname)
# FIX FILE NAMING ERROS BEFORE PROCESSING FILE
<- str_replace(fname, "-world", "_world")
fname <- str_replace(fname, "mgcom_", "mgcomp_")
fname <- str_replace(fname, "mcs2023-zirco_", "mcs2023-zirco-hafni_")
fname <- str_replace(fname, "mcs2024-bismu_world", "MCS2024-bismu_world")
fname if (substr(fname, 8, 8) == "_") {
<- str_replace(fname, "_", "-")
fname
}
# LOCATE AND READ FILE
<- list.files(here("data/ingest"), pattern = fname, full.names = T, recursive = T)
fpath <- fread(fpath, header = T, colClasses = c("character"))
d
# IF COLUMN NAME = "Form" CHANGE TO "Type"
if ("Form" %in% colnames(d)) setnames(d, "Form", "Type")
# IF COLUMN NAME = "Mine production, fluorspar" CHANGE TO "Type"
if ("Mine production, fluorspar" %in% colnames(d)) setnames(d, "Mine production, fluorspar", "Type")
# ADD Commodity COLUMN USING file_vars
:= file_vars[i]$mineral]
d[, Commodity
# ADD Meas_Unit COLUMN USING file_vars
:= file_vars[i]$unit]
d[, Meas_Unit
# CREATE/REPLACE SOURCE COLUMN (sometimes doesn't exist)
$Source <- toupper(substr(basename(fpath), 1, 7))
d
# TRANSPOSE TO LONG FORMAT
<- melt(d,
m id.vars = c("Source", "Commodity", "Country", "Type", "Meas_Unit"),
variable.name = "Meas_Name",
value.name = "Value",
variable.factor = F
)
# REMOVE NAs
<- m[!is.na(Value), ]
m
# REMOVE IF COUNTY IS MISSING
<- m[Country != "", ]
m
# ADD TO data.table
<- rbindlist(list(dt_lng, m))
dt_lng }
# ORDER COLUMNS
<- dt_lng[, .(Source, Commodity, Country, Type, Meas_Unit, Meas_Name, Value)]
dt2224 # CAPITALIZE COLUMN NAMES
colnames(dt2224) <- toupper(colnames(dt2224))
# LOAD 2025 FILE
<- fread(here("data/ingest/usgs_mineral_2025/MCS2025_World_Data.csv")) d25
toupper(COMMODITY) %like% "TITAN" & toupper(COUNTRY) %like% "SIERRA", UNIT_MEAS := "metric tons"] d25[
# MELT
<- melt(d25,
d25m id.vars = c("SOURCE", "COMMODITY", "COUNTRY", "TYPE", "UNIT_MEAS"),
variable.name = "MEAS_NAME",
value.name = "VALUE"
)
Warning in melt.data.table(d25, id.vars = c("SOURCE", "COMMODITY", "COUNTRY", :
'measure.vars' [PROD_2023, PROD_EST_ 2024, PROD_NOTES, CAP_2023, ...] are not
all of the same type. By order of hierarchy, the molten data value column will
be of type 'character'. All measure variables not of type 'character' will be
coerced too. Check DETAILS in ?melt.data.table for more on coercion.
# UPDATE COLUMN NAME
setnames(d25m, "UNIT_MEAS", "MEAS_UNIT")
# CAPITALIZE MEAS_UNIT
:= toupper(MEAS_UNIT)] d25m[, MEAS_UNIT
# COMBINE 2022-2025
<- rbindlist(list(dt2224, d25m))
dt2225
# CONVERT FACTOR COLUMN TO CHARACTER (ANNOYING)
$MEAS_NAME <- as.character(dt2225$MEAS_NAME) dt2225
# ADD SRC_YR COLUMN
:= substr(SOURCE, 4, 7)]
dt2225[, SRC_YR
# ADD MEAS_TYPE COLUMN
toupper(MEAS_NAME) %like% "^PROD", MEAS_TYPE := "Production"]
dt2225[toupper(MEAS_NAME) %like% "^RESERV", MEAS_TYPE := "Reserve"]
dt2225[toupper(MEAS_NAME) %like% "^CAP" | toupper(TYPE) %like% "CAPACITY", MEAS_TYPE := "Capacity"]
dt2225[
# ADD ESTIMATED COLUMN
toupper(MEAS_NAME) %like% "EST", ESTIMATED := "Y"]
dt2225[
# ADD MEAS_YR COLUMN
!is.na(MEAS_UNIT), MEAS_YR := substr(MEAS_NAME, nchar(MEAS_NAME) - 3, nchar(MEAS_NAME))]
dt2225[!is.na(MEAS_UNIT) & toupper(MEAS_NAME) %like% "RESERV", MEAS_YR := as.integer(SRC_YR) - 1] dt2225[
# remove leading and following whitespace
:= trimws(VALUE)]
dt2225[, VALUE
# remove leading "e"
%like% "^e[0-9]", VALUE := substr(VALUE, 2, nchar(VALUE))]
dt2225[VALUE
# remove empty notes
<- dt2225[!(is.na(MEAS_UNIT) & VALUE == ""), ]
dt2225
# notes don't have measurements
toupper(MEAS_NAME) %like% "NOTE", MEAS_UNIT := NA]
dt2225[toupper(MEAS_NAME) %like% "NOTE", MEAS_YR := NA]
dt2225[
# remove meas_name = "V9" - from Barite file
<- dt2225[!MEAS_NAME %like% "^V"]
dt2225
# remove gt and lt symbols in VALUE
%like% "^>", VALUE := substr(VALUE, 2, nchar(VALUE))]
dt2225[VALUE %like% "^<", VALUE := substr(VALUE, 2, nchar(VALUE))]
dt2225[VALUE
# substitute and for & in COMMODITY
%like% "&", COMMODITY := str_replace_all(COMMODITY, "&", "and")]
dt2225[COMMODITY
# remove where VALUE equals "XX" (meaning invalid)
<- dt2225[!VALUE == "XX"]
dt2225
# remove if NA or blank in VALUE
<- dt2225[!is.na(VALUE) & VALUE != ""]
dt2225
# capitalize COMMODITY, COUNTRY, and MEAS_TYPE
:= toupper(COMMODITY)]
dt2225[, COMMODITY := toupper(COUNTRY)]
dt2225[, COUNTRY := toupper(MEAS_TYPE)]
dt2225[, MEAS_TYPE
# add space before open parentheses
%like% "[A-Z]\\(", COMMODITY := str_replace(COMMODITY, "([A-Z])\\(", "\\1 (")]
dt2225[COMMODITY
# double dash to single and add space before and after
%like% "\\)--[A-Z]", COMMODITY := str_replace(COMMODITY, "--", " - ")] dt2225[COMMODITY
== "BAUXITE AND ALUMINA", COMMODITY := "BAUXITE"]
dt2225[COMMODITY == "NIOBIUM (COLUMBIUM)", COMMODITY := "NIOBIUM"]
dt2225[COMMODITY == "MICA", COMMODITY := "MICA (NATURAL)"]
dt2225[COMMODITY == "GRAPHITE (NATURAL)", COMMODITY := "GRAPHITE"]
dt2225[COMMODITY == "FELDSPAR AND NEPHELINE SYENITE", COMMODITY := "FELDSPAR"]
dt2225[COMMODITY == "TALC AND PYROPHYLLITE", COMMODITY := "TALC, CRUDE"]
dt2225[COMMODITY == "ABRASIVES (MANUFACTURED)", COMMODITY := "ABRASIVES"] dt2225[COMMODITY
== "GEMSTONES", MEAS_UNIT := toupper("thousands of carats of gem-quality diamond")]
dt2225[COMMODITY == "MAGNESIUM COMPOUNDS", MEAS_UNIT := toupper("thousand metric tons")]
dt2225[COMMODITY == "POTASH", MEAS_UNIT := "THOUSAND METRIC TONS"]
dt2225[COMMODITY == "SELENIUM", MEAS_UNIT := "METRIC TONS"] dt2225[COMMODITY
# remove accents
:= stringi::stri_trans_general(COUNTRY, "Latin-ASCII")]
dt2225[, COUNTRY
# only use country names from list
<- read.csv(here("country_names.csv"))
country_names for (nm in country_names[, 1]) {
%like% paste0("^", nm) & !COUNTRY == "UNITED STATES AND CANADA", COUNTRY := nm]
dt2225[COUNTRY
}
# dt2225[COUNTRY %like% "[0-9]$", COUNTRY := stringr::str_remove_all(COUNTRY, "[0-9]")]
# dt2225[COUNTRY %like% "\\)$", COUNTRY := stringr::str_replace(COUNTRY, "(.*)\\(.*", "\\1")]
# dt2225[, COUNTRY := stringr::str_replace_all(COUNTRY, "REPUBLIC OFE", "REPUBLIC OF")]
# dt2225[(!COUNTRY %like% "KOREA" & !COUNTRY %like% "WORLD") & COUNTRY %like% ",", COUNTRY := stringr::str_replace(COUNTRY, "(.*),.*", "\\1")]
:= trimws(COUNTRY)] dt2225[, COUNTRY
# ORDER COLUMNS
<- dt2225[, .(SOURCE, COMMODITY, COUNTRY, TYPE, MEAS_UNIT, MEAS_NAME, MEAS_TYPE, SRC_YR, MEAS_YR, ESTIMATED, VALUE)]
dt2225
# WRITE FILE
fwrite(dt2225, here("data/output/world_mineral_commodity_reports_2022-2025.csv"))