Filters for Train-Validation-Test Split – Technical Documentation

This QScript creates three new Filters based on a random 50%/25%/25% split of the selected data. These filters can then be applied to predictive models to separate a training data set from a validation data set and a testing set. The QScript can be amended to adjust the split ratio.

Example

How to Split a Sample in Displayr

Technical details

The values of trainPercentage and validationPercentage in the QScript code below control the split ratio. The defaults of 50 and 25 mean that 50% of the data (rounded to the nearest whole number of instances) is selected as part of the Training split, 25% of the data (also rounded to the nearest whole number of instances) forms part of the Validation split, and the remaining 25% is in the Testing split filter.

In Q, the percentages in the training and validation filters (and therefore the remainder left for the testing filter) can be controlled by adjusting these two values as described below in Customizing the QScript.

Code

// This script creates 3 new filters based upon a random split of the data.
// The three filters are the training, validation and testing samples built by
// filtersForTrainTestValidationSplit() below.
 
// Load the shared QScript libraries before the entry point runs.
// NOTE(review): presumably these supply the helpers the function relies on
// (e.g. getAllUserSelections, reportNewRQuestion) - confirm against the libraries.
includeWeb("QScript Selection Functions");
includeWeb("QScript Functions to Generate Outputs"); 
 
// Entry point: run the split. The boolean result is not used here.
filtersForTrainTestValidationSplit()
 
 
/**
 * Creates three filter variables (training, validation and testing samples)
 * from a random split of one data set and combines them into a single
 * "Pick Any" filter question.
 *
 * The user is prompted for the training and validation percentages (defaults
 * 50 and 25); the testing sample is every case in neither of the other two.
 * The data set is taken from the selected questions, else the project's only
 * data set, else (outside Displayr) a data set chosen by the user.
 *
 * @returns {boolean} true when the filter question was created; false when a
 *     percentage is invalid, no data set could be determined, or one of the
 *     R variables could not be created (the reason is written with log()).
 */
function filtersForTrainTestValidationSplit() {

    const is_displayr = inDisplayr();

    // parseFloat() yields NaN for blank or non-numeric input, and NaN compares
    // false against every bound, so it has to be rejected explicitly.
    const isValidPercentage = (value) => !Number.isNaN(value) && value >= 0 && value <= 100;

    // Set percentage of data used for training and validation set
    const trainPercentage = parseFloat(prompt("What percentage of the data set should be used as the training set?", 50));
    if (!isValidPercentage(trainPercentage)) {
        log("Invalid split.  Please ensure that the training data percentage is between 0 and 100.");
        return false;
    }
    const validationPercentage = parseFloat(prompt("What percentage of the data set should be used as the validation set?", 25));
    if (!isValidPercentage(validationPercentage)) {
        log("Invalid split.  Please ensure that the validation data percentage is between 0 and 100.");
        return false;
    }
    // A sum of exactly 100 is allowed; it simply leaves the testing sample empty.
    const combinedPercentage = trainPercentage + validationPercentage;
    if (combinedPercentage > 100) {
        log("The percentage of training and validation data should sum to no more than 100: " + combinedPercentage);
        return false;
    }

    // Get the data
    let dataFile;
    const user_selections = getAllUserSelections();
    const selected_questions = user_selections.selected_questions;
    if (selected_questions.length > 0) {
        dataFile = project.report.selectedQuestions()[0].dataFile;
    } else if (project.dataFiles.length === 1) {
        dataFile = project.dataFiles[0];
    } else if (project.dataFiles.length === 0) {
        log("Please add a data set.");
        return false;
    } else if (!is_displayr) {
        dataFile = dataFileSelection()[0];
    } else {
        log("Please select data from a single data set.");
        return false;
    }

    const variablesPrefix = "`" + dataFile.name + "`$Variables$";

    // Create a training filter based on a random sample
    const trainingRCode = [
        "percentage <- " + trainPercentage + " # Change this number to change the percentage in the training sample",
        "set.seed(123) # This ensures that the randomization is identical each time",
        "n <- " + dataFile.totalN + " # This is the total sample size",
        "indices <- sample.int(n, round(percentage * n / 100))",
        "filter <- rep(0, n)",
        "filter[indices] <- 1",
        "filter"
    ].join("\n");

    let train;
    try {
        train = dataFile.newRVariable(trainingRCode, preventDuplicateVariableName(dataFile, "training"), "Training sample", null);
    } catch (e) {
        log("Could not create train filter: " + e);
        return false;
    }
    const trainFullName = variablesPrefix + train.name;

    // Create a validation filter based on those not selected in training filter.
    // The training and validation counts are rounded independently, so the
    // requested size can exceed the cases left over (e.g. n = 3 with a 50/50
    // split); min() caps it so sample.int() is never asked for more than exist.
    const validationRCode = [
        "percentage <- " + validationPercentage + " # Change this number to change the percentage in the validation sample",
        "set.seed(123) # This ensures that the randomization is identical each time",
        "n <- " + dataFile.totalN + " # This is the total sample size",
        "n.remaining <- n - sum(" + trainFullName + ")",
        "indices <- sample.int(n.remaining, min(n.remaining, round(percentage * n / 100)))",
        "filter <- rep(0, n)",
        "filter[" + trainFullName + " == 0][indices] <- 1",
        "filter"
    ].join("\n");

    let validation;
    try {
        validation = dataFile.newRVariable(validationRCode, preventDuplicateVariableName(dataFile, "validation"), "Validation sample", null);
    } catch (e) {
        log("Could not create validation filter: " + e);
        return false;
    }

    // Create test filter from those not selected in either training or validation filters
    const testingRCode = "as.numeric(!(" + trainFullName + " + " + variablesPrefix + validation.name + "))";
    let test;
    try {
        test = dataFile.newRVariable(testingRCode, preventDuplicateVariableName(dataFile, "testing"), "Testing sample", null);
    } catch (e) {
        log("Could not create test filter: " + e);
        return false;
    }

    // Combine the 3 new variables into a Pick-Any question
    const trainValidateTest = dataFile.setQuestion(preventDuplicateQuestionName(dataFile, "Train validate test split"),
                                                   "Pick Any", [train, validation, test]);
    // Whatever was appended to the base name to make the question name unique
    // is carried over to the variable labels as well.
    const suffix = trainValidateTest.name.replace(/^Train validate test split/, "");
    trainValidateTest.variables[0].label = "Training sample" + suffix;
    trainValidateTest.variables[1].label = "Validation sample" + suffix;
    trainValidateTest.variables[2].label = "Testing sample" + suffix;
    trainValidateTest.isFilter = true;
    setCountThisValueForVariablesInQuestion(trainValidateTest, 1, true);
    trainValidateTest.needsCheckValuesToCount = false;
    insertAtHoverButtonIfShown(trainValidateTest);
    reportNewRQuestion(trainValidateTest, "Train validate test split");
    return true;
}

Articles in this section

Example

Technical details

Code

Related articles