Detecting Outliers

Bill_French · ‎Feb 20, 2020

In my data science work, I often need to find outliers. There are many ways to do this and one might think it can be done reliably using a simple filter - e.g.,

Show me anything that falls outside this range.

This assumes (a) you know the correct range of outliers, and (b) the context for the range is never likely to change. In practice, your data will prove you wrong about these assumptions time and time-again.

In cases where transaction flow is small, a quick scan will usually suffice to see if your data passes the sniff-test. But when you have many thousands of records and dozens of fields to consider, you might want to audit your data with powerful statistical analysis.

Ideally, you need a process that can learn based on all of the data available and apply its rules consistently, even when you’re not in the room.

To that end, here’s a script block that uses the multiple interquartile methodology to find outliers. Once again, this is a pretty good use of a Script Block because doing this with formulas is likely impossible. Full source is provided … enjoy.

/*

   ***********************************************************
   Airdrop - Outlier Detection
   Copyright (c) 2020 by Global Technologies Corporation
   ALL RIGHTS RESERVED
   ***********************************************************
   
*/

// display app title
output.markdown('# AirDrop Outlier Detection');

// display app introduction
output.markdown('This Outlier Dection app scans a table and a selected number column looking for values that are not expected.');

// prompt for the table name
let sourceTblName = await input.textAsync('Enter the Table Name to Scan');
try {
  var sourceTable = base.getTable(sourceTblName);
} catch(e) {
  output.markdown("ERROR: @ " + e.stack + " :: " + e.message + " (22)");
  return(false);
}

// prompt for the number column name
let sourceColName = await input.textAsync('Enter the Field Name to Scan (IMPORTANT: Must be a Number Field!)');

// prompt for a reference column name
let sourceRefName = await input.textAsync('Enter the Reference Field Name to associate with the ' + sourceColName + ' data.');

// get the data set
try {
  var result = await sourceTable.selectRecordsAsync();
} catch(e) {
  output.markdown("ERROR: @ " + e.stack + " :: " + e.message + " (36)");
  return(false);
}

// iterate across all the records
var aData = [];
try {
  for (let record of result.records)
  {
    if (record.getCellValue(sourceColName))
      aData.push([record.getCellValue(sourceRefName), record.getCellValue(sourceColName)]);
  }
} catch(e) {
  output.markdown("ERROR: @ " + e.stack + " :: " + e.message + " (49)");
  return(false);
}

// test for outliers
var aAnomalies = findAnomalies2D(aData, 1, 1);
var minValue   = aAnomalies[0];
var maxValue   = aAnomalies[1];
var aResults   = aAnomalies[2];

// filter for the outliers
var oOutliers = [];
var count = 0;
for (var i = 0; i < aData.length; i++)
{
  if (aData[i][2] != "Inlier")
  {
    oOutliers[i.toString()] = {};
    oOutliers[i.toString()][sourceRefName] = aData[i][0].toString();
    oOutliers[i.toString()][sourceColName] = aData[i][1].toFixed(2).replace(/(\d)(?=(\d\d\d)+(?!\d))/g, "$1,");
    oOutliers[i.toString()]["Anomaly"]     = aData[i][3].toString().replace("+", "High").replace("-", "Low").replace("x", "Normal");
    oOutliers[i.toString()]["Deviation"]   = (aData[i][3] == "+") ? (aData[i][1] - maxValue).toFixed(0).replace(/(\d)(?=(\d\d\d)+(?!\d))/g, "$1,") : (aData[i][1] - minValue).toFixed(0).replace(/(\d)(?=(\d\d\d)+(?!\d))/g, "$1,");
    count += 1;
  }
}

output.markdown('## Inliers: ' + (aData.length - count).toString() + " ... Outliers: " + count.toString() + " (" + ((count / aData.length) * 100).toFixed(1).toString() + "%)");

// output.markdown(count.toString());
output.table(oOutliers);

function findAnomalies2D(aArray, valsCol, lastCol) {
  
  // if there's not at least 4 rows...
  if (aArray.length < 4)
    return([0, 0, aArray]);

  let values, q1, q3, iqr, maxValue, minValue;

  // build the values array
  var aValues = [];
  for (var i = 0; i < aArray.length; i++)
  {
    aValues.push(aArray[i][valsCol]);
  }

  // sort the dataframe
  aValues = aValues.sort((a, b) => a - b);

  //
  // determine the quartiles based on multiple of the inter-quartile range
  //
  if((aValues.length / 4) % 1 === 0){
    q1 = 1/2 * (aValues[(aValues.length / 4)] + aValues[(aValues.length / 4) + 1]);
    q3 = 1/2 * (aValues[(aValues.length * (3 / 4))] + aValues[(aValues.length * (3 / 4)) + 1]);
  } else {
    q1 = aValues[Math.floor(aValues.length / 4 + 1)];
    q3 = aValues[Math.ceil(aValues.length * (3 / 4) + 1)];
  }

  //
  // determine the min/max range
  //
  iqr = q3 - q1;
  maxValue = q3 + iqr * 1.5;
  minValue = q1 - iqr * 1.5;

  //
  // add the classifications to the dataframe
  //
  for (var i = 0; i < aArray.length; i++)
  {
    if (aArray[i][valsCol] <= minValue) {
      aArray[i][lastCol + 1] = "Outlier";
      aArray[i][lastCol + 2] = "-";
      aArray[i][lastCol + 3] = (minValue - aArray[i][valsCol]);
    } else if (aArray[i][valsCol] >= maxValue) {
      aArray[i][lastCol + 1] = "Outlier";
      aArray[i][lastCol + 2] = "+";
      aArray[i][lastCol + 3] = (aArray[i][valsCol] - maxValue);
    } else {
      aArray[i][lastCol + 1] = "Inlier";
      aArray[i][lastCol + 2] = "x";
      aArray[i][lastCol + 3] = 0;
    }
  }

  return([minValue, maxValue, aArray]);

}

function sort2DByColumn(a, b)
{
  if (a[1] === b[1]) {
    return 0;
  }
  else {
    return (a[1] < b[1]) ? -1 : 1;
  }
}