Feb 22, 2024 05:08 PM
Hello,
I have a DB with ~15,000 records. I wrote a script to merge and delete records. It compares three data fields for duplicates and combines the duplicate records. It also merges data from two other fields when those fields are not empty.
I tested my script on a 500 record DB and it works fine. When I test on a 2,500 record DB it times out.
Can anyone suggest how I can cut the execution time of my script?
I've seen posts about using a hash index, but I don't think that would work for what I want to do. Because I need to iterate through all records to find duplicates. If I'm wrong I'd appreciate insight.
const table = base.getTable('Testing_Dupes');
const query = await table.selectRecordsAsync();
// Find records with a duplicate (First Name, Last Name, Residence).
// FIX: the original ran query.records.find() inside query.records.filter(),
// an accidental O(n^2) scan — the cause of the timeout past ~2,500 records.
// Instead, count each key in one O(n) pass, then keep records whose key
// occurs more than once.
// NOTE(review): keys are built with JSON.stringify, which matches the
// original's === comparison for primitive cell values (strings/null);
// confirm these three fields return primitives, not objects.
const dupeKey = (record) => JSON.stringify([
    record.getCellValue("First Name"),
    record.getCellValue("Last Name"),
    record.getCellValue("Residence"),
]);
const keyCounts = new Map();
for (const record of query.records) {
    const k = dupeKey(record);
    keyCounts.set(k, (keyCounts.get(k) ?? 0) + 1);
}
// Same contents as before: every record that has at least one duplicate.
let dupli = query.records.filter((record) => keyCounts.get(dupeKey(record)) > 1);
//one array for each field with updates because not all duplicate records have same fields to update (cannot have null field pushed to asyncupdate)
// NOTE(review): this pairwise scan is O(n^2) over `dupli`, and because j starts
// at 0 (not i+1) every unordered pair is visited twice — so `del` and the update
// arrays can receive the same record id multiple times. Post 2 in this thread
// de-duplicates them after the fact with uniqByKeepFirst/dupeDel.
let updatePN = [ ];
let updateEM = [ ];
let del = [];
// Use a for loop to iterate over the records
for (let i = 0; i < dupli.length; i++) {
let record = dupli[i];
//traverse duplicate array
for (let j = 0; j < dupli.length; j++) {
let potentialDupe = dupli[j];
// Compare the three identity fields; `ident` guards against comparing a
// record with itself (j ranges over the whole array, including j === i).
let fName = record.getCellValue("First Name") === potentialDupe.getCellValue("First Name");
let lName = record.getCellValue("Last Name") === potentialDupe.getCellValue("Last Name");
let resi = record.getCellValue("Residence") === potentialDupe.getCellValue("Residence");
let ident = record.id !== potentialDupe.id;
//single out records with same first and last name and residence
if (fName && lName && resi && ident) {
// Initialize update variables inside the loop
// `record` is the survivor; `potentialDupe` is the copy queued for deletion.
let upPN = record.getCellValue("Phone Number");
let upEM = record.getCellValue("Email Address");
let upID = record.id;
// `flag` tracks whether any merge was queued; if not, the dupe is deleted outright.
let flag = false;
// NOTE(review): records whose Residence is null are matched above
// (null === null) but skipped here, so they are never merged or deleted.
if(record.getCellValue("Residence") !== null){
//update phone number array
// Copy the duplicate's phone number only when the survivor has none.
if (upPN == null && potentialDupe.getCellValue("Phone Number") !== null) {
upPN = potentialDupe.getCellValue("Phone Number");
upID = record.id; // Update this to record.id since you're updating the current record
updatePN.push({id: upID, upPN});
del.push(potentialDupe.id);
flag = true;
}
//update email array
// Same merge rule for email, independently of the phone-number branch.
if (upEM == null && potentialDupe.getCellValue("Email Address") !== null) {
upEM = potentialDupe.getCellValue("Email Address");
upID = record.id; // Update this to record.id as well
updateEM.push({id: upID, upEM});
del.push(potentialDupe.id);
flag = true;
}
//add to delete, if no PN and EM OR same PN and EM
if (flag == false){
del.push(potentialDupe.id);
}
}//if residence
}//if true
}//for
}//for
// Reshape the queued phone-number updates into the { id, fields } objects
// that updateRecordsAsync expects.
let upPNmap = updatePN.map((update) => ({
    id: update.id,
    fields: { "Phone Number": update.upPN },
}));
// Write the phone-number updates in batches of 50 (the API's per-call limit),
// logging each batch before it is sent.
while (upPNmap.length > 0) {
    const batch = upPNmap.slice(0, 50);
    console.log(batch);
    await table.updateRecordsAsync(batch);
    upPNmap = upPNmap.slice(50);
}
// Reshape the queued email updates into the { id, fields } objects
// that updateRecordsAsync expects.
let upEMmap = updateEM.map((update) => ({
    id: update.id,
    fields: { "Email Address": update.upEM },
}));
// Write the email updates in batches of 50 (the API's per-call limit),
// logging each batch before it is sent.
while (upEMmap.length > 0) {
    const batch = upEMmap.slice(0, 50);
    console.log(batch);
    await table.updateRecordsAsync(batch);
    upEMmap = upEMmap.slice(50);
}
// Delete the duplicate records in batches of 50 (the API's per-call limit).
// BUG FIX: the original called deleteRecordAsync (singular — takes ONE record
// id) with an array slice, and never shrank `del`, so the loop could never
// terminate. Use the plural deleteRecordsAsync and advance past each batch.
console.log(del);
while (del.length > 0) {
    await table.deleteRecordsAsync(del.slice(0, 50));
    del = del.slice(50);
}//while
Feb 23, 2024 06:10 PM
Hello,
I found my own answer.
I sort the database first by name, then iterate through until the value changes. This way I'm not running an O(n²) search on a 15k-record database.
This is not my full code as it is not all relevant to this question:
var table = base.getTable('All_Voters_No_Duplicates');
// Fetch records pre-sorted by Addressee so duplicates are adjacent and can be
// found with a single forward scan instead of an O(n^2) pairwise search.
var query = await table.selectRecordsAsync({
sorts: [
{field: "Addressee", direction: "asc"}
]});
// Use a for loop to iterate over the records
for (let i = 0; i < query.records.length-1; i++) {
// `count` is the lookahead offset from i; it resets to 1 on every outer pass.
let count = 1;
let record = query.records[i];
let potentialDupe = query.records[i+ count];
let exit = false;
/*
bulk of code
*/
// NOTE(review): the elided "bulk of code" presumably contains the inner while
// loop that consumes `exit`; in this excerpt `exit` is set but never read,
// and the lookahead below runs once per outer iteration — confirm against the
// full script.
count++;
potentialDupe = query.records[i + count];
if(potentialDupe == undefined){
exit = true;
}//condition to exit while loop
}//for
// The scan above can queue the same record id more than once, so strip
// repeats from the update and delete lists before writing anything.
const toUpPN = uniqByKeepFirst(updatePN, (entry) => entry.id);
const toUpEM = uniqByKeepFirst(updateEM, (entry) => entry.id);
let toUpDEL = dupeDel(del);
// Reshape both update lists into the { id, fields } objects that
// updateRecordsAsync expects.
let upPNmap = toUpPN.map((update) => ({
    id: update.id,
    fields: { "Phone Number": update.upPN },
}));
let upEMmap = toUpEM.map((update) => ({
    id: update.id,
    fields: { "Email Address": update.upEM },
}));
// Write phone numbers in batches of 50 (the API's per-call limit).
while (upPNmap.length > 0) {
    await table.updateRecordsAsync(upPNmap.slice(0, 50));
    upPNmap = upPNmap.slice(50);
}
// Write email addresses in batches of 50.
while (upEMmap.length > 0) {
    await table.updateRecordsAsync(upEMmap.slice(0, 50));
    upEMmap = upEMmap.slice(50);
}
// Delete the confirmed duplicates, also in batches of 50.
while (toUpDEL.length > 0) {
    await table.deleteRecordsAsync(toUpDEL.slice(0, 50));
    toUpDEL = toUpDEL.slice(50);
}//while
//functions to delete duplicates
//objects in array
function uniqByKeepFirst(a, key) {
    // Return a new array keeping only the FIRST item for each distinct
    // key(item) value, preserving the original order of the survivors.
    const seen = new Set();
    const result = [];
    for (const item of a) {
        const k = key(item);
        if (seen.has(k)) continue;
        seen.add(k);
        result.push(item);
    }
    return result;
}
//strings in array
/**
 * Remove duplicate values from an array of primitives (record id strings
 * here), keeping first occurrences in their original order.
 * FIX: the original used filter + indexOf, which is O(n^2) — slow on a
 * ~15k-entry delete list. A Set does the same de-duplication in O(n).
 * @param {Array} a - array of primitive values
 * @returns {Array} new array with duplicates removed
 */
function dupeDel(a) {
    return [...new Set(a)];
}
Feb 24, 2024 08:35 PM
You can test this script: it merges the values in each set of matches, puts them in the first record, and marks the others as 'DBL'.
At first, run it with the write command commented out — when the number of records is under 50k, the computing itself should take a matter of milliseconds, provided, of course, you don't accidentally create a loop inside the main loop (as I did last year, when I wrote a 'predecessor' of this script).
I tried it on a ~1,000-record table — it outputs the table right after clicking 'Run'.
const table = base.getTable('Testing_Dupes');
// Fields whose combined value identifies a duplicate.
// BUG FIX: 'First name' did not match the field name 'First Name' used by the
// other scripts in this thread on the same table — getCellValue would not
// find the field under the wrong casing.
const check = ['First Name', 'Last Name', 'Residence'];
// Fields whose values get merged into the first record of each duplicate set.
const merge = ['Phone Number', 'Email Address'];
const query = await table.selectRecordsAsync({fields: [...check, ...merge]});
// Duplicate key for a record.
// BUG FIX: join with a separator that cannot appear in the data — the
// original join('') made ('ab','c') collide with ('a','bc').
const val = (r) => check.map((f) => r.getCellValue(f)).join('\u0000');
// Group records by key: Map<key, record[]>, preserving encounter order.
const valrecs = query.records.reduce(
    (a, v) => a.set(val(v), [...(a.get(val(v)) || []), v]),
    new Map(),
);
// For member 0 of a group, merge every member's values (newline-joined);
// every later member gets the marker 'DBL' instead.
const combine = (arr, i) =>
    merge.map((f) => [f, i ? 'DBL' : arr.map((r) => r.getCellValue(f)).join('\n')]);
const mrg = (arr, ix) => Object.fromEntries(combine(arr, ix));
const values = [...valrecs.keys()];
// One { id, fields } update per record: merged values or the 'DBL' marker.
const upds = values.flatMap((v) =>
    valrecs.get(v).map((r, ix, arr) => ({id: r.id, fields: mrg(arr, ix)})));
console.log('Values merged, duplicates marked. Writing..'); output.table(upds);
// Uncomment to actually write, in batches of 50 (the API's per-call limit):
//while (upds.length) await table.updateRecordsAsync(upds.splice(0,50))