Tidy Data Catalogue

Below is a tool to help explore the data. If you want to documentiation on how the code is structured, read the coding guide

Variables

The table below contains a list of all the tidy variables. These correspond directly to what we asked in the survey or simple transformations of them.

Variations

Note that for a lot of the variables, there are variations within the dataset that are omitted for brevity from the list below. Variations of a variable are denoted by suffix to the original name. For example, for the hypothetical variable income, income_z would denote the z-score of income.

Common suffixes and their definitions:

_z: z-score of the variable based on the baseline distribution
_99 / _95 : variable winsorized at the 99th (or 95th) percentile
_pc: per capita version of the variable (divided by household size)
_ae: adult equivalent version of the variable (divided by adult equivalent household size)

Some index variables exist. The description clarify how they are constructed. Alternatively, their definitions can be found in the do-file by consulting the do-file that generated the relevant dataset.

Code

variables = FileAttachment("library/data_dictionary/output/data_dictionary.csv").csv({ typed: true })

// PDF roots for each survey
baselinePDFRoot = "survey_pdfs/14. vip_baseline.pdf"
phonePDFRoot = "survey_pdfs/15. vip_phone_survey (002)May22.pdf"
endlinePDFRoot = "survey_pdfs/17. vip_endline-v4.pdf"
fcmPDFRoot = "survey_pdfs/fcm.pdf"
pickupPDFRoot = "survey_pdfs/20.PickupDropff-VIP.pdf"
censusPDFRoot = "survey_pdfs/census.pdf" 

// Load all annotation files
baselinePageAnnotations = FileAttachment("library/data_dictionary/output/variable_tags/llm_annotations_baseline.csv").csv({ typed: true })
phonePageAnnotations = FileAttachment("library/data_dictionary/output/variable_tags/llm_annotations_phone_gemini-3-flash-preview.csv").csv({ typed: true })
endlinePageAnnotations = FileAttachment("library/data_dictionary/output/variable_tags/llm_annotations_endline_gemini-3-flash-preview.csv").csv({ typed: true })
fcmPageAnnotations = FileAttachment("library/data_dictionary/output/variable_tags/llm_annotations_fcm_gemini-3-flash-preview.csv").csv({ typed: true })
pickupPageAnnotations = FileAttachment("library/data_dictionary/output/variable_tags/llm_annotations_pickup_gemini-3-flash-preview.csv").csv({ typed: true })
censusPageAnnotations = FileAttachment("library/data_dictionary/output/variable_tags/llm_annotations_census_gemini-3-flash-preview.csv").csv({ typed: true })

// Helper function to find page annotation for a variable
function findPage(annotations, variable_name, dataset_id) {
  let annotation = annotations.find(a => a.variable_name === variable_name && a.dataset_id === dataset_id);
  if (annotation && !isNaN(annotation.predicted_page) && annotation.predicted_page != -1) {
    return annotation.predicted_page;
  }
  return -1;
}

// Helper function to build survey links HTML
function buildSurveyLinks(d) {
  let links = [];
  if (d.baselinePage && d.baselinePage != -1) {
    links.push(`<a target="_blank" href="${baselinePDFRoot}#page=${d.baselinePage}">BL:P${d.baselinePage}</a>`);
  }
  if (d.phonePage && d.phonePage != -1) {
    links.push(`<a target="_blank" href="${phonePDFRoot}#page=${d.phonePage}">Phone:P${d.phonePage}</a>`);
  }
  if (d.endlinePage && d.endlinePage != -1) {
    links.push(`<a target="_blank" href="${endlinePDFRoot}#page=${d.endlinePage}">EL:P${d.endlinePage}</a>`);
  }
  if (d.fcmPage && d.fcmPage != -1) {
    links.push(`<a target="_blank" href="${fcmPDFRoot}#page=${d.fcmPage}">FCM:P${d.fcmPage}</a>`);
  }
  if (d.pickupPage && d.pickupPage != -1) {
    links.push(`<a target="_blank" href="${pickupPDFRoot}#page=${d.pickupPage}">Pickup:P${d.pickupPage}</a>`);
  }
  if (d.censusPage && d.censusPage != -1) {
    links.push(`<a target="_blank" href="${censusPDFRoot}#page=${d.censusPage}">Census:P${d.censusPage}</a>`);
  }
  if (links.length === 0) {
    return '<span style="color: gray;">No question</span>';
  }
  return links.join(' | ');
}

// Merge page annotations into data
data = variables.map(d => {
  d.baselinePage = findPage(baselinePageAnnotations, d.variable_name, d.dataset_id);
  d.phonePage = findPage(phonePageAnnotations, d.variable_name, d.dataset_id);
  d.endlinePage = findPage(endlinePageAnnotations, d.variable_name, d.dataset_id);
  d.fcmPage = findPage(fcmPageAnnotations, d.variable_name, d.dataset_id);
  d.pickupPage = findPage(pickupPageAnnotations, d.variable_name, d.dataset_id);
  d.censusPage = findPage(censusPageAnnotations, d.variable_name, d.dataset_id);
  d.surveyLinks = buildSurveyLinks(d);
  return d;
});

Code

viewof search = Inputs.search(data, {
  placeholder: "Search variables, paths, or labels...",
  label: "Search"
})

viewof selectedDataset = Inputs.select(uniqueDatasets, {
  label: "Dataset",
  value: "All"
})

viewof visibleColumns = Inputs.checkbox(
  ["dataset_id", "variable_name", "variable_label", "surveyLinks"],
  {
    label: "Show Columns",
    value: ["dataset_id", "variable_name", "variable_label", "surveyLinks"],
    format: x => x === "dataset_id" ? "Dataset" : x === "variable_name" ? "Variable" : x === "variable_label" ? "Description" : "Survey Links" 
  }
)

Code

Inputs.table(filteredData, {
  rows: 15,
  layout: "fixed",
  columns: visibleColumns,
  header: {
    dataset_id: "Dataset",
    variable_name: "Variable",
    variable_label: "Description",
    surveyLinks: "Survey Links"
  },
  format: {
    surveyLinks: links => {
      const span = document.createElement('span');
      span.innerHTML = links;
      return span;
    }
  }
})

Datasets

The table below contains a list of all the tidy datasets available in the data repository along with their descriptions and paths. By tidy data, I mean data that corresponds very closely to what we asked directly, with grouped by which module of the survey they come from.

Exceptions

While most of the datasets follow the above principle, there are a few exceptions.

First, some datasets are aggregated versions of the raw data. These exist either for convenience and to ensure that aggregation is done close to where we clean the raw data, to avoid mistakes. Additionally, we collect only aggregated data for some modules in the phone survey; in these cases the in-person data is aggregated and joined with the phone survey.

The column, Survey, denotes which survey the data comes from as well as whether it was aggregated or not.

Another exception regards variable grouping. Some set of variables do not have an obvious way to group them. To avoid creating too many do-files and datasets for one or two questions, I grouped those below into the two datasets below:

07_InPerson_hh_level-hh_id-period -> questions asked to the household at the in-person surveys (baseline and endline)
16_period_hh_level-hh_id-period -> questions asked across all periods (some might be phone survey only)

Code

dataset_list = FileAttachment("library/data_dictionary/output/dataset_statistics.csv").csv({ typed: true })

viewof ds_search = Inputs.search(dataset_list, {
  placeholder: "Search datasets",
  label: "Search"
})

viewof visibleDatasetColumns = Inputs.checkbox(
  ["dataset_id", "description", "variable_count", "row_count", "observation_level", "sources"],
  {
    label: "Show Columns",
    value: ["dataset_id", "description", "variable_count", "row_count", "observation_level", "sources"],
    format: x => x === "dataset_id" ? "Dataset" : x === "variable_count" ? "Variable Count" : x === "row_count" ? "Row Count" : x === "observation_level" ? "Observation Level" : x === "sources" ? "Survey" : "Description"
  }
)
/* The following dummies indicate what the source of the data is:
        phone   baseline    endline fcm timeuse pickup  dropoff beliefs census
    
    I want to create a filter view that allows users to filter by these sources
*/

viewof sourceFilter = Inputs.checkbox(
  [
    "Phone",
    "Baseline",
    "Endline",
    "FCM",
    "Time Use",
    "Pickup",
    "Dropoff",
    "Beliefs",
    "Census"
  ],
  {
    label: "From Survey",
    value: ["Phone", "Baseline", "Endline", "FCM", "Time Use", "Pickup", "Dropoff", "Beliefs", "Census"]
  }
)

filteredDatasetList = {
  let result = ds_search;
  
  if (sourceFilter.length > 0) {
    result = result.filter(d => {
      for (let source of sourceFilter) {
        // Remove spaces and lowercases
        let key = source.toLowerCase().replace(" ", "");
        if (d[key] === 1) {
          return true;
        }
      }
      return false;
    });
  }
  
  return result;
}

Code

Inputs.table(filteredDatasetList, {
  rows: 20,
  columns: visibleDatasetColumns,
  layout: "fixed",
  width: {
    dataset_id: 50,
    description: 300,
    variable_count: 25,
    row_count: 50,
    observation_level: 100,
    sources: 150
  },
  header: {
    description: "Description",
    dataset_id: "Dataset",
    variable_count: "Variable Count",
    row_count: "Observations",
    observation_level: "Observation Level",
    sources: "Survey"
  }
})