# TalTech_python/teams_task1.py

from xml.etree import ElementTree as ET

# Parse the sample export once; the tasks in this script iterate over the
# <row> elements that are direct children of the document root.
dom = ET.parse("xml-sample-file.xml")
rows = dom.findall("row")

# Task 1:
# Count the users with notifications parameter turned on (sendmenotifications = true)
# and place the result into the variable notifications_count (int)
#
# findtext() returns None when a row has no <sendmenotifications> child, so
# such a row is simply not counted instead of raising AttributeError.
# The comparison is exact: only the literal text "true" counts as enabled.
notifications_count: int = sum(
    1 for element in rows if element.findtext("sendmenotifications") == "true"
)

print(f"notifications_count: {notifications_count}")


# Task 2:
# Search for users with same email addresses and
# save their list as a json file email_dupes.json as a dictionary with name: email pairs
import json

# Group user names by email address (exact, case-sensitive match).
emails = {}
for element in rows:
    name = element.findtext("name")
    mail = element.findtext("email")
    if not mail:
        # A missing or empty address is not "the same address" as another
        # missing one, so such rows are left out of the grouping.
        continue
    emails.setdefault(mail, []).append(name)

# Keep only the addresses used by more than one user and flatten them into
# the requested {name: email} mapping.
# NOTE: users that share both name and email collapse into a single key.
email_pairs = {}
for mail, names in emails.items():
    if len(names) > 1:
        email_pairs.update(dict.fromkeys(names, mail))

with open('email_dupes.json', 'w', encoding="utf-8") as f:
    json.dump(email_pairs, f)

# Task 3:
# Search for similar names using SequenceMatcher from difflib
# (combinations from itertools might come useful as well). Set similarity score at 85%.
# Save a list of names in name_dupes.json file as a list (array) of strings.
# Optionally, search for near duplicates using Levenshtein distance.
from difflib import SequenceMatcher
from itertools import combinations

# Collect the distinct names; rows without a name are skipped because
# SequenceMatcher cannot compare None.
names = set()
for element in rows:
    name = element.findtext("name")
    if name:
        names.add(name)

threshold = 0.85
name_pairs = []      # (name_a, name_b, similarity) for every pair at/above threshold
similar_names = set()  # every name that takes part in at least one such pair

# Iterate in sorted order: the iteration order of a set of strings can differ
# between interpreter runs, and SequenceMatcher.ratio() may depend on argument
# order, so sorting keeps both the scores and the output file reproducible.
for str1, str2 in combinations(sorted(names), 2):
    similarity = SequenceMatcher(None, str1, str2).ratio()
    if similarity >= threshold:
        name_pairs.append((str1, str2, similarity))
        similar_names.update((str1, str2))

# The task asks for a flat array of strings, so only the names are written;
# the scored pairs stay available in name_pairs.
name_dupes = sorted(similar_names)

with open('name_dupes.json', 'w', encoding="utf-8") as f:
    json.dump(name_dupes, f, indent=4)