I have a program which adds array data to a database. I want to check and see if a record already exists. If the record already exists, then I don't want to add it to the array ( and eventually the database) . I created an if else statement to only add the data to the array if it doesn't already exist in the database. However, when I check to see if the duplicate data is added in the database, I see the duplicate entries added:
test=# select count(*) from cert;
count
-------
100
(1 row)
test=# select count(*) from cert;
count
-------
200
(1 row)
test=#
the second hundred added are all duplicate entries
Here is my Python code where I am attempting to filter these duplicates out:
import json
import json, urllib, requests
import psycopg2
from psycopg2.extras import execute_values
def cleanse(inputData):
out = inputData #data cleanse function
out = out.replace("[", "")
out = out.replace("]", "")
out = out.replace('"', '')
out = out.replace("", "")
out = out.replace(" ", "")
out = out.replace(" ", "")
out = ''.join(out.split())
return out
#connect to the databse
try:
conn = psycopg2.connect("dbname='test' user='postgres' host='localhost' password='test_password' ")
except:
print("Database Connection Failure")
cur = conn.cursor()
#access the API
url = "https://api.certspotter.com/v1/issuances?domain=google.com&include_subdomains=true&expand=dns_names&expand=issuer&expand=cert"
response = urllib.request.urlopen(url)
fulljson = json.loads(response.read())
length = len(fulljson)
issuerSubJson = json.dumps(fulljson[1]["issuer"])
issuerSubJson2 = json.loads(issuerSubJson)
#create 2d array
certgrab_array = [[0 for x in range(11)] for y in range(length)]
#traverse the 2d array and add values from json elements
for i in range(0, length):
row_count = 0
idcheck = cleanse(json.dumps(fulljson[i]["id"]))
cur.execute("SELECT id from cert where id = %s;", [idcheck])
print(idcheck)
row_count = cur.rowcount
if row_count > 0:
exists = True
else:
exists = False
print(exists)
if not exists:
column = 0
#create a sub json variable for the "issuer" sub attributes
issuerSubJson = json.dumps(fulljson[i]["issuer"])
issuerSubJson2 = json.loads(issuerSubJson)
id1 = cleanse(json.dumps(fulljson[i]["id"]))
certgrab_array[i][column] = id1
tbs_sha256 = cleanse(json.dumps(fulljson[i]["tbs_sha256"]))
certgrab_array[i][1] = tbs_sha256
dns = cleanse(json.dumps(fulljson[i]["dns_names"]))
certgrab_array[i][2] = dns
pubkey_sha256 = cleanse(json.dumps(fulljson[i]["pubkey_sha256"]))
certgrab_array[i][3] = pubkey_sha256
j = 4
name = cleanse(json.dumps(issuerSubJson2["name"]))
certgrab_array[i][j] = name
j = j + 1
issuerSha256 = cleanse(json.dumps(issuerSubJson2["pubkey_sha256"]))
certgrab_array[i][j] = issuerSha256
not_before = cleanse(json.dumps(fulljson[i]["not_before"]))
certgrab_array[i][6] = not_before
not_after = cleanse(json.dumps(fulljson[i]["not_after"]))
certgrab_array[i][7] = not_after
p = 8
#Create a sub json variable for the "cert" sub attributes
certSubJson = json.dumps(fulljson[i]["cert"])
certSubJson2 = json.loads(certSubJson)
certType = cleanse(json.dumps(certSubJson2["type"]))
certgrab_array[i][p] = certType
p = p + 1
sha256 = cleanse(json.dumps(certSubJson2["sha256"]))
certgrab_array[i][p] = sha256
p = p + 1
data = cleanse(json.dumps(certSubJson2["data"]))
certgrab_array[i][p] = data
else:
continue
insert_query = "INSERT INTO cert(id, tbs_sha256, dns_names, pubkey_sha256, issuer_name, issuer_pubkey_sha256, not_before, not_after, cert_type, cert_hash, cert_data) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
cur.executemany(insert_query, certgrab_array)
conn.commit()
cur.close()
conn.close()
The exists and idcheck variables get their intended values:
418675152
True
526898343
True
526994364
True
528715730
True
528738608
True
529648183
True
Aucun commentaire:
Enregistrer un commentaire