suppressPackageStartupMessages({
library(ipumsr)
library(tidyverse)
})
extract_definition_path <- "data/prcs_migration_analysis.json"
data_path <- "data/prcs_migration_analysis.dat.gz"
ddi_path <- "data/prcs_migration_analysis.xml"
This analysis of IPUMS data is designed to be shared, and thus does not assume that you have already downloaded the data used in the analysis. The code below checks whether the data are already downloaded, and if they aren’t, it submits a new IPUMS extract request according to the specifications in the included extract definition JSON file.
# Define path to "waiting_for_extract" flag file
data_dir <- dirname(extract_definition_path)
waiting_for_extract_path <- file.path(data_dir, "waiting_for_extract.txt")
# Ensure the JSON extract definition is present
json_file_exists <- file.exists(extract_definition_path)
if (!json_file_exists) {
stop(
"File '", extract_definition_path, "' not found; make sure that ",
"`extract_definition_path` is the path to the .json extract definition ",
"file.", call. = FALSE
)
}
# Are the data downloaded, or are we waiting for an extract?
data_not_downloaded <- !file.exists(data_path)
data_downloaded <- file.exists(data_path)
waiting_for_extract <- file.exists(waiting_for_extract_path)
# Ensure that IPUMS_API_KEY environment variable is defined
ipums_api_key_undefined <- Sys.getenv("IPUMS_API_KEY") == ""
if (data_not_downloaded & ipums_api_key_undefined) {
stop(
"Environment variable 'IPUMS_API_KEY' is undefined. Make sure you've ",
"followed the instructions under 'Setting up your API key' in the ",
"ipumsr API vignette (`vignette(\"ipums-api\", package = \"ipumsr\")`) ",
"before running this script.", call. = FALSE
)
}
# If not yet waiting for extract, create and submit a new extract and create
# the "waiting_for_extract" flag file
if (data_not_downloaded & !waiting_for_extract) {
extract_definition <- define_extract_from_json(extract_definition_path)
submitted_extract <- submit_extract(extract_definition)
writeLines(
paste0(submitted_extract$collection, ":", submitted_extract$number),
con = waiting_for_extract_path
)
waiting_for_extract <- TRUE
}
# If waiting for an extract, read extract ID from flag file and check the status
if (data_not_downloaded & waiting_for_extract) {
extract_id <- readLines(waiting_for_extract_path)
extract_info <- get_extract_info(extract_id)
extract_is_ready <- is_extract_ready(extract_info)
extract_is_stale <- !extract_is_ready & extract_info$status == "completed"
if (extract_is_stale) {
stop(
paste0(
"The data files for ", extract_info$collection, " extract number ",
extract_info$number, " have been removed from IPUMS servers. ",
"Please delete the file '", waiting_for_extract_path, "' and re-run ",
"the template."
),
call. = FALSE
)
}
# If the extract is ready, download files and rename to match the JSON file,
# then delete the waiting_for_extract flag file
if (extract_is_ready) {
orig_ddi_path <- download_extract(extract_info, download_dir = data_dir)
orig_data_path <- gsub("\\.xml$", ".dat.gz", orig_ddi_path)
ddi_file_successfully_renamed <- file.rename(orig_ddi_path, ddi_path)
data_file_successfully_renamed <- file.rename(orig_data_path, data_path)
if (!ddi_file_successfully_renamed || !data_file_successfully_renamed) {
stop(
"Problem renaming DDI and/or data file; please report bug at ",
"https://github.com/ipums/ipumsr/issues, including a copy of this ",
"file if possible.", call. = FALSE
)
}
data_downloaded <- TRUE
waiting_file_successfully_removed <- file.remove(waiting_for_extract_path)
if (!waiting_file_successfully_removed) {
stop(
"Unable to remove 'waiting_for_extract.txt'; please report bug at ",
"https://github.com/ipums/ipumsr/issues, including a copy of this ",
"file if possible.", call. = FALSE
)
}
} else { # If extract is not ready, stop execution
stop(
"NOT AN ERROR: ", extract_info$collection, " extract number ",
extract_info$number, " is not yet ready to download. Try ",
"re-running again later.", call. = FALSE
)
}
}
ddi <- read_ipums_ddi(ddi_path)
data <- read_ipums_micro(ddi, data_file = data_path)
## Use of data from IPUMS USA is subject to conditions including that users should
## cite the data appropriately. Use command `ipums_conditions()` for more details.
The percentage of people who had moved in the last year increased between 2017 and 2018 from about 6% to over 8% among all persons in Puerto Rico, but the magnitude of this trend varies by education, household income, and age.
Note: These graphs show trends in point estimates from sample data, without displaying estimates of sampling error. Differences over time or across groups may not be statistically significant. To calculate confidence intervals for point estimates, follow the IPUMS USA instructions for using replicate weights.
# Prep education variable
college_regex <- "^[123] year(s)? of college$"
data <- data %>%
mutate(
EDUCD3 = EDUCD %>%
lbl_collapse(~.val %/% 10) %>%
lbl_relabel(
lbl(2, "Less than High School") ~.val > 0 & .val < 6,
lbl(3, "High school") ~.lbl == "Grade 12", #<<
lbl(4, "Some college") ~str_detect(.lbl, college_regex), #<<
lbl(5, "College or more") ~.val %in% c(10, 11)
) %>%
as_factor()
)
# Prep income variable
value_to_quintile <- function(x) {
cut_points <- quantile(x, probs = c(0.2, 0.4, 0.6, 0.8), na.rm = TRUE)
cut(
x,
breaks = c(-Inf, cut_points, Inf),
labels = c("Lowest", "Lower Middle", "Middle", "Upper Middle", "Highest"),
ordered_result = TRUE
)
}
hhincome_quintiles <- data %>%
filter(PERNUM == 1 & HHINCOME != 9999999) %>%
group_by(YEAR) %>%
mutate(hhincome_quintile = value_to_quintile(HHINCOME)) %>%
ungroup() %>%
select(YEAR, SERIAL, hhincome_quintile)
data <- data %>%
left_join(hhincome_quintiles, by = c("YEAR", "SERIAL"))
# Prep migration variable
data <- data %>%
mutate(
moved_in_last_year = case_when(
MIGRATE1 %in% c(0, 9) ~ NA,
MIGRATE1 == 1 ~ FALSE,
MIGRATE1 %in% 2:4 ~ TRUE
)
)
# Prep age variable
age_to_age_group <- function(x) {
cut_points <- c(9, 17, 34, 64)
cut(
x,
breaks = c(-Inf, cut_points, Inf),
labels = c("0-9", "10-17", "18-34", "35-64", "65+"),
ordered_result = TRUE
)
}
data <- data %>%
mutate(age_group = age_to_age_group(AGE))