Help

Re: Exceeding 30 second limit - Script - Delete and Merge

744 0
cancel
Showing results for 
Search instead for 
Did you mean: 
seaTurtlesRUS
4 - Data Explorer
4 - Data Explorer

Hello, 

I have a DB ~15,000 records. I wrote a script to merge and delete records. It compares three data fields for duplications and combines the duplicate records. It also will merge data from two other fields if the fields are not empty. 

I tested my script on a 500 record DB and it works fine. When I test on a 2,500 record DB it times out. 

Can someone make any suggestions on how I can cut the execution time on my script ? 

I've seen posts about using a hash index, but I don't think that would work for what I want to do. Because I need to iterate through all records to find duplicates. If I'm wrong I'd appreciate insight. 

 

 

var table = base.getTable('Testing_Dupes');
var query = await table.selectRecordsAsync();


//find records with a duplicate (first name, last name, residence)
let dupli = query.records.filter((record) =>{
    return query.records.find((potDupe)=>{
        return record.getCellValue("First Name") === potDupe.getCellValue("First Name") && record.id !== potDupe.id && record.getCellValue("Last Name") === potDupe.getCellValue("Last Name") && record.getCellValue("Residence") === potDupe.getCellValue("Residence");
    })
});


//one array for each field with updates because not all duplicate records have same fields to update (cannot have null field pushed to asyncupdate)
let updatePN = [ ];
let updateEM = [ ];
let del = [];

// Use a for loop to iterate over the records
for (let i = 0; i < dupli.length; i++) {
    let record = dupli[i];
//traverse duplicate array 
    for (let j = 0; j < dupli.length; j++) {
        let potentialDupe = dupli[j];
        let fName = record.getCellValue("First Name") === potentialDupe.getCellValue("First Name");
        let lName = record.getCellValue("Last Name") === potentialDupe.getCellValue("Last Name");
        let resi = record.getCellValue("Residence") === potentialDupe.getCellValue("Residence");
        let ident = record.id !== potentialDupe.id;

//single out records with same first and last name and residence 
        if (fName && lName && resi && ident) {
            // Initialize update variables inside the loop
            let upPN = record.getCellValue("Phone Number");
            let upEM = record.getCellValue("Email Address");
            let upID = record.id;
            let flag = false;

            if(record.getCellValue("Residence") !== null){
                //update phone number array
                if (upPN == null && potentialDupe.getCellValue("Phone Number") !== null) {
                    upPN = potentialDupe.getCellValue("Phone Number");
                    upID = record.id; // Update this to record.id since you're updating the current record
                    updatePN.push({id: upID, upPN});
                    del.push(potentialDupe.id);
                    flag = true;
                }
                //update email array 
                if (upEM == null && potentialDupe.getCellValue("Email Address") !== null) {
                    upEM = potentialDupe.getCellValue("Email Address");
                    upID = record.id; // Update this to record.id as well
                    updateEM.push({id: upID, upEM});
                    del.push(potentialDupe.id);
                    flag = true;
                }
                //add to delete, if no PN and EM OR same PN and EM
                if (flag == false){
                    del.push(potentialDupe.id);
                }
            }//if residence 

        }//if true
    }//for 
}//for


//map phone number array to compatiable format 
let upPNmap = updatePN.map (update =>{
    return {
        "id": update.id, 
        fields: {
            "Phone Number": update.upPN
        }
    }
});

//update records with phone numbers 
while (upPNmap.length > 0 ){

console.log(upPNmap.slice(0,50));
    await table.updateRecordsAsync(upPNmap.slice(0,50));
    upPNmap = upPNmap.slice(50);
    
}


//map email array to compatiable format 
let upEMmap = updateEM.map (update =>{
    return {
        "id": update.id, 
        fields: {
            "Email Address": update.upEM
        }
    }
});

//update records with email 
while (upEMmap.length > 0 ){

console.log(upEMmap.slice(0,50));
    await table.updateRecordsAsync(upEMmap.slice(0,50));
    upEMmap = upEMmap.slice(50);
}

//delete duplicates 

console.log(del);


while(del.length > 0){

    await table.deleteRecordAsync(del.slice(0,50));

}//while

 

 

2 Replies 2
seaTurtlesRUS
4 - Data Explorer
4 - Data Explorer

Hello, 

I found my own answer.

I sort the database first by name, then iterate through until the value changes. This way I'm not running a O^2 search on a 15k record database. 

This is not my full code as it is not all relevant to this question: 

var table = base.getTable('All_Voters_No_Duplicates');
var query = await table.selectRecordsAsync({
        sorts: [
        {field: "Addressee", direction: "asc"}
    ]});

// Use a for loop to iterate over the records
for (let i = 0; i < query.records.length-1; i++) {
    let count = 1;
    let record = query.records[i];
    let potentialDupe = query.records[i+ count];
    let exit = false; 
/*
bulk of code
*/
        count++;
        potentialDupe = query.records[i + count];
        if(potentialDupe == undefined){
            exit = true;
        }//condition to exit while loop 
}//for

//remove duplicate entries from update and delete arrays
let toUpPN = uniqByKeepFirst(updatePN, key => key.id)
let toUpEM = uniqByKeepFirst(updateEM, key => key.id)
let toUpDEL = dupeDel(del);

//map phone number array to compatiable format 
let upPNmap = toUpPN.map (update =>{
    return {
        "id": update.id, 
        fields: {
            "Phone Number": update.upPN
        }
    }
});

//map email array to compatiable format 
let upEMmap = toUpEM.map (update =>{
    return {
        "id": update.id, 
        fields: {
            "Email Address": update.upEM
        }
    }
});

//update records with phone numbers 
while (upPNmap.length > 0 ){
    await table.updateRecordsAsync(upPNmap.slice(0,50));
    upPNmap = upPNmap.slice(50);   
}
//update records with email 
while (upEMmap.length > 0 ){
    await table.updateRecordsAsync(upEMmap.slice(0,50));
    upEMmap = upEMmap.slice(50);
}
//delete duplicates
while(toUpDEL.length > 0){
    await table.deleteRecordsAsync(toUpDEL.slice(0,50));
    toUpDEL = toUpDEL.slice(50);
}//while

//functions to delete duplicates 
//objects in array
function uniqByKeepFirst(a, key) {
    let seen = new Set();
    return a.filter(item => {
        let k = key(item);
        return seen.has(k) ? false : seen.add(k);
    });
}
//strings in array
function dupeDel(a) {
  function onlyUnique(value, index, self) { 
      return self.indexOf(value) === index;
  }
  // usage
  var unique = a.filter( onlyUnique ); 
  return unique;
}

 

 

You can test this script, it merge values in each set of matches, puts it in first record and mark others as 'DBL'.
At first try run it with commented write command, because when number of records less than 50k, the computing itself should be a matter of milliseconds. Of course, when you accidentally didn't create a loop inside main loop (as I did last year, when I wrote a 'predecessor' of this script)

I tried it on ~1000 records table - it outputs table just after click 'Run'

const table=base.getTable('Testing_Dupes')
const check=['First name','Last Name','Residence']
const merge=['Phone Number','Email Address'] 
const query=await table.selectRecordsAsync({fields:[...check,...merge]})
const val=r=>check.map(f=>r.getCellValue(f)).join('')
const valrecs=query.records.reduce((a,v)=>a.set(val(v),[...a.get(val(v))||[],v]),new Map())
const combine=(arr,i)=>merge.map(f=>[f,i? 'DBL':arr.map(r=>r.getCellValue(f)).join('\n')])
const mrg=(arr,ix)=>Object.fromEntries(combine(arr,ix))
const values=[...valrecs.keys()]
const upds=values.flatMap(v=>valrecs.get(v).map((r,ix,arr)=>({id:r.id,fields:mrg(arr,ix)})))
console.log('Values merged, duplicates marked. Writing..'); output.table(upds)
//while (upds.length) await table.updateRecordsAsync(upds.splice(0,50))