"""Find notification counts, duplicate emails and near-duplicate names.

Reads ``xml-sample-file.xml`` (a document whose root holds ``row`` elements
with ``name``, ``email`` and ``sendmenotifications`` children), prints the
results and writes ``email_dupes.json`` and ``name_dupes.json`` to the
current working directory.
"""
import json
from collections import defaultdict
from difflib import SequenceMatcher
from itertools import combinations
from typing import Dict, Iterable, List, Optional, Tuple
from xml.etree import ElementTree as ET

XML_PATH = "xml-sample-file.xml"
EMAIL_DUPES_PATH = "email_dupes.json"
NAME_DUPES_PATH = "name_dupes.json"
SIMILARITY_THRESHOLD = 0.85


def count_notifications(rows: Iterable[ET.Element]) -> int:
    """Return how many rows have ``sendmenotifications`` equal to ``"true"``.

    Rows without a ``sendmenotifications`` child are counted as "off"
    (``findtext`` returns ``None`` for them) instead of raising.
    """
    return sum(row.findtext("sendmenotifications") == "true" for row in rows)


def find_email_dupes(rows: Iterable[ET.Element]) -> Dict[str, str]:
    """Return ``{name: email}`` for every user whose email is shared.

    An email counts as shared when more than one row carries it. Rows with
    a missing or empty ``name`` or ``email`` are skipped, so that users
    without an address are not reported as duplicates of each other.
    """
    names_by_email: Dict[str, List[str]] = defaultdict(list)
    for row in rows:
        name = row.findtext("name")
        mail = row.findtext("email")
        if name and mail:
            names_by_email[mail].append(name)

    return {
        name: mail
        for mail, names in names_by_email.items()
        if len(names) > 1
        for name in names
    }


def find_similar_names(
    names: Iterable[Optional[str]],
    threshold: float = SIMILARITY_THRESHOLD,
) -> List[Tuple[str, str, float]]:
    """Return ``(name_a, name_b, similarity)`` for each similar pair of names.

    Names are de-duplicated and sorted first so the result order is the same
    on every run. ``None`` and empty names are ignored. A pair is reported
    when its ``SequenceMatcher.ratio()`` is at least ``threshold``.
    """
    unique_names = sorted({name for name in names if name})
    similar: List[Tuple[str, str, float]] = []
    for first, second in combinations(unique_names, 2):
        matcher = SequenceMatcher(None, first, second)
        # real_quick_ratio() and quick_ratio() are cheap upper bounds on
        # ratio(), so a pair failing either can be rejected without paying
        # for the full comparison.
        if (
            matcher.real_quick_ratio() < threshold
            or matcher.quick_ratio() < threshold
        ):
            continue
        similarity = matcher.ratio()
        if similarity >= threshold:
            similar.append((first, second, similarity))
    return similar


def collect_dupe_names(pairs: Iterable[Tuple[str, str, float]]) -> List[str]:
    """Return the sorted, de-duplicated names that occur in any pair."""
    return sorted({name for first, second, _ in pairs for name in (first, second)})


def main() -> None:
    """Run the three tasks against ``XML_PATH`` and write the JSON reports."""
    dom = ET.parse(XML_PATH)
    rows = dom.findall("row")

    # Task 1:
    # Count the users with notifications parameter turned on
    # (sendmenotifications = true) and place the result into the variable
    # notifications_count (int)
    notifications_count: int = count_notifications(rows)
    print(f"notifications_count: {notifications_count}")

    # Task 2:
    # Search for users with same email addresses and save their list as a
    # json file email_dupes.json as a dictionary with name: email pairs
    email_pairs = find_email_dupes(rows)
    with open(EMAIL_DUPES_PATH, "w", encoding="utf-8") as f:
        json.dump(email_pairs, f)

    # Task 3:
    # Search for similar names using SequenceMatcher from difflib
    # (combinations from itertools might come useful as well).
    # Set similarity score at 85%.
    # Save a list of names in name_dupes.json file as a list (array) of
    # strings.
    # Optionally, search for near duplicates using Levenshtein distance.
    names = {row.findtext("name") for row in rows}
    name_pairs = find_similar_names(names, SIMILARITY_THRESHOLD)
    print(name_pairs)
    with open(NAME_DUPES_PATH, "w", encoding="utf-8") as f:
        # Only the names are written; the scored pairs are printed above.
        json.dump(collect_dupe_names(name_pairs), f, indent=4)


if __name__ == "__main__":
    main()