This QScript creates three new Filters based on a random 50%/25%/25% split of the selected data. These filters can then be applied to predictive models to separate a training data set from a validation data set and a testing set. The QScript can be amended to adjust the split ratio.
Example
How to Split a Sample in Displayr
Technical details
The values of trainPercentage and validationPercentage in the QScript code below control the split ratio. The defaults of 50 and 25 mean that 50% of the data (rounded to the nearest whole number of instances) is selected as part of the Training split, 25% of the data (also rounded to the nearest whole number of instances) forms part of the Validation split, and the remaining 25% is in the Testing split filter.
In Q, the percentages in the training, validation and testing filters can be controlled by adjusting these values, as described below in Customizing the QScript.
Code
// This script creates 3 new filters based upon a random split of the data.
// Load the shared QScript libraries. The helper functions called by
// filtersForTrainTestValidationSplit() that are not defined in this script are
// presumably supplied by these two libraries - TODO confirm.
includeWeb("QScript Selection Functions");
includeWeb("QScript Functions to Generate Outputs");
// Run the script: prompt for the split percentages and build the filters.
filtersForTrainTestValidationSplit()
/**
 * Creates three 0/1 filter variables (training, validation, testing) from a
 * random split of one data file and combines them into a single "Pick Any"
 * filter question.
 *
 * The user is prompted for the training and validation percentages (defaults
 * 50 and 25). The testing filter contains every case that is in neither the
 * training nor the validation filter.
 *
 * @returns {boolean} true when the filter question was created; false when a
 *   percentage was invalid, no single data file could be determined, or one of
 *   the R variables could not be created (a message is logged in each case).
 */
function filtersForTrainTestValidationSplit() {
    const is_displayr = inDisplayr();

    // Asks the user for one percentage; logs and returns null when it is unusable.
    function promptForPercentage(setName, defaultValue) {
        const value = parseFloat(prompt("What percentage of the data set should be used as the " + setName + " set?", defaultValue));
        // NaN (non-numeric or missing input) compares false against every
        // bound, so it has to be rejected explicitly.
        if (Number.isNaN(value) || value < 0 || value > 100) {
            log("Invalid split. Please ensure that the " + setName + " data percentage is between 0 and 100.");
            return null;
        }
        return value;
    }

    // Creates one R variable in the data file; logs and returns null on failure.
    function createFilterVariable(file, rCode, baseName, label, shortName) {
        try {
            return file.newRVariable(rCode, preventDuplicateVariableName(file, baseName), label, null);
        } catch (e) {
            log("Could not create " + shortName + " filter: " + e);
            return null;
        }
    }

    // Set percentage of data used for training and validation set
    const trainPercentage = promptForPercentage("training", 50);
    if (trainPercentage === null) {
        return false;
    }
    const validationPercentage = promptForPercentage("validation", 25);
    if (validationPercentage === null) {
        return false;
    }
    // A sum of exactly 100 is allowed; the testing filter is then empty.
    if (trainPercentage + validationPercentage > 100) {
        log("The percentage of training and validation data should sum to no more than 100: " + (trainPercentage + validationPercentage));
        return false;
    }

    // Get the data
    let dataFile;
    const selected_questions = getAllUserSelections().selected_questions;
    if (selected_questions.length > 0) {
        dataFile = project.report.selectedQuestions()[0].dataFile;
    } else if (project.dataFiles.length === 1) {
        dataFile = project.dataFiles[0];
    } else if (project.dataFiles.length === 0) {
        log("Please add a data set.");
        return false;
    } else if (!is_displayr) {
        dataFile = dataFileSelection()[0];
    } else {
        log("Please select data from a single data set.");
        return false;
    }

    // Create a training filter based on a random sample
    const trainRText = [
        "percentage <- " + trainPercentage + " # Change this number to change the percentage in the training sample",
        "set.seed(123) # This ensures that the randomization is identical each time",
        "n <- " + dataFile.totalN + " # This is the total sample size",
        "indices <- sample.int(n, round(percentage * n / 100))",
        "filter <- rep(0, n)",
        "filter[indices] <- 1",
        "filter"
    ].join("\n");
    const train = createFilterVariable(dataFile, trainRText, "training", "Training sample", "train");
    if (!train) {
        return false;
    }
    const variablePrefix = "`" + dataFile.name + "`$Variables$";
    const trainFullName = variablePrefix + train.name;

    // Create a validation filter based on those not selected in training filter.
    // The sample size is capped at n.remaining: both sizes are rounded
    // independently, so without the cap a split such as 50/50 on an odd n can
    // ask sample.int() for more cases than are left, which is an error in R.
    const validationRText = [
        "percentage <- " + validationPercentage + " # Change this number to change the percentage in the validation sample",
        "set.seed(123) # This ensures that the randomization is identical each time",
        "n <- " + dataFile.totalN + " # This is the total sample size",
        "n.remaining <- n - sum(" + trainFullName + ")",
        "indices <- sample.int(n.remaining, min(n.remaining, round(percentage * n / 100)))",
        "filter <- rep(0, n)",
        "filter[" + trainFullName + " == 0][indices] <- 1",
        "filter"
    ].join("\n");
    const validation = createFilterVariable(dataFile, validationRText, "validation", "Validation sample", "validation");
    if (!validation) {
        return false;
    }

    // Create test filter from those not selected in either training or validation filters
    const testRText = "as.numeric(!(" + trainFullName + " + " + variablePrefix + validation.name + "))";
    const test = createFilterVariable(dataFile, testRText, "testing", "Testing sample", "test");
    if (!test) {
        return false;
    }

    // Combine the 3 new variables into a Pick-Any question
    const trainValidateTest = dataFile.setQuestion(
        preventDuplicateQuestionName(dataFile, "Train validate test split"),
        "Pick Any",
        [train, validation, test]
    );
    // Whatever was appended to the base name to make the question name unique
    // is appended to the variable labels as well.
    const suffix = trainValidateTest.name.replace(/^Train validate test split/, "");
    trainValidateTest.variables[0].label = "Training sample" + suffix;
    trainValidateTest.variables[1].label = "Validation sample" + suffix;
    trainValidateTest.variables[2].label = "Testing sample" + suffix;
    trainValidateTest.isFilter = true;
    setCountThisValueForVariablesInQuestion(trainValidateTest, 1, true);
    trainValidateTest.needsCheckValuesToCount = false;
    insertAtHoverButtonIfShown(trainValidateTest);
    reportNewRQuestion(trainValidateTest, "Train validate test split");
    return true;
}