The Filters for Train-Test Split QScript creates two new filters based on a random 70%/30% split of the selected data. These filters can then be applied to predictive models to separate a training data set from a test data set. The split ratio can be changed by adjusting the QScript.
Examples
Displayr: How to Split a Sample
Q: Automate > Browse Online Library > Filter > Filters for Train-Test Split
Technical details
The value of trainPercentage in the QScript code below controls the split ratio. The default of 70 means that 70% of the data (rounded to the nearest whole number of instances) is selected by the Training sample filter, and the remaining 30% is selected by the Testing sample filter.
In Q, the percentages in the training and testing filters can be controlled by adjusting this value as described below in Customizing the QScript.
Code
// This script creates 2 new filters based upon a random split of the data.
// The two includeWeb calls load shared QScript libraries by name; presumably they
// supply the helpers used by filtersForTrainTestSplit (e.g. getAllUserSelections,
// reportNewRQuestion) -- confirm against the library pages.
includeWeb("QScript Selection Functions");
includeWeb("QScript Functions to Generate Outputs");
// Entry point. The function is declared further down; function declarations are
// hoisted, so calling it here before its definition is valid. The boolean it
// returns is not used.
filtersForTrainTestSplit();
/**
 * Creates a pair of complementary filters ("Training sample" / "Testing sample")
 * from a random split of one data set and combines them into a single
 * Pick Any filter question.
 *
 * The user is prompted for the training percentage (default 70). The data set
 * is taken from the current selection when there is one, otherwise from the
 * project's only data set, otherwise (outside Displayr) from a data-file
 * selection dialog.
 *
 * @returns {boolean} true when both filters were created and reported; false
 *     when the percentage is invalid, no single data set can be determined,
 *     or either R variable could not be created (the reason is logged).
 */
function filtersForTrainTestSplit() {
    const is_displayr = inDisplayr();

    // Set percentage of data used for training set.
    // The answer is spliced into R source below, so it is coerced to a finite
    // number first: blank, null or non-numeric input would otherwise slip past
    // the range comparison (every comparison with NaN is false) and end up
    // verbatim inside the generated R code.
    const rawPercentage = prompt("What percentage of the data set should be used as the training set?", 70);
    const isBlank = rawPercentage === null || rawPercentage === undefined || String(rawPercentage).trim() === "";
    const trainPercentage = isBlank ? NaN : Number(rawPercentage);
    if (!Number.isFinite(trainPercentage) || trainPercentage < 0 || trainPercentage > 100) {
        log("Invalid split. Please ensure that trainPercentage is between 0 and 100.");
        return false;
    }

    // Get the data
    let dataFile;
    const user_selections = getAllUserSelections();
    const selected_questions = user_selections.selected_questions;
    if (selected_questions.length > 0) {
        // Only the first selected question decides which data set is used.
        dataFile = project.report.selectedQuestions()[0].dataFile;
    } else if (project.dataFiles.length == 1) {
        dataFile = project.dataFiles[0];
    } else if (project.dataFiles.length == 0) {
        log("Please add a data set.");
        return false;
    } else if (!is_displayr) {
        // Several data sets and nothing selected: outside Displayr, ask the user.
        dataFile = dataFileSelection()[0];
    } else {
        log("Please select data from a single data set.");
        return false;
    }

    // Create a training filter based on a random sample
    let RText = "percentage <- " + trainPercentage + " # Change this number to change the percentage in the training sample\n" +
        "set.seed(123) # This ensures that the randomization is identical each time\n" +
        "n <- " + dataFile.totalN + " # This is the total sample size\n" +
        "indices <- sample.int(n, round(percentage * n / 100))\n" +
        "filter <- rep(0, n)\n" +
        "filter[indices] <- 1\n" +
        "filter";
    const trainQuestionName = preventDuplicateQuestionName(dataFile, "Training sample");
    let train;
    try {
        train = dataFile.newRVariable(RText, preventDuplicateVariableName(dataFile, "training"), trainQuestionName, null);
    } catch (e) {
        log("Could not create train filter: " + e);
        return false;
    }
    train.needsCheck = false;

    // Create testing filter of the data not selected by the training filter
    RText = "as.numeric(!`" + dataFile.name + "`$Variables$" + train.name + ")"; // backticks allow a hyphen in dataFile.name
    // De-duplicate the question name in the same way as for the training filter
    // so that the two filters are created consistently.
    const testQuestionName = preventDuplicateQuestionName(dataFile, "Testing sample");
    let test;
    try {
        test = dataFile.newRVariable(RText, preventDuplicateVariableName(dataFile, "testing"), testQuestionName, null);
    } catch (e) {
        log("Could not create test filter: " + e);
        return false;
    }
    test.needsCheck = false;

    // Combine the 2 new variables into a Pick-Any question
    const trainTest = dataFile.setQuestion(preventDuplicateQuestionName(dataFile, "Train test split"), "Pick Any", [train, test]);
    // Whatever was appended to "Train test split" to make the name unique is
    // reused on both variable labels.
    const suffix = trainTest.name.replace(/^Train test split/, "");
    trainTest.variables[0].label = "Training sample" + suffix;
    trainTest.variables[1].label = "Testing sample" + suffix;
    trainTest.needsCheckValuesToCount = false;
    trainTest.isFilter = true;
    insertAtHoverButtonIfShown(trainTest);
    reportNewRQuestion(trainTest, "Filter");
    return true;
}