I... don't have this issue. I do see your example is a substat. Are you outputting the parent stat object?
Why aren't you filtering out duplicates in your program? I'm just doing something simple like:
# Deduplicate stat_object elements by their "Name" field value.
# Assumption: when two stat_objects share a Name, the one whose <fields>
# element has MORE <field> children is the most up-to-date, so we keep it
# and remove the one with the fewest fields.
seen = set()  # Name values already encountered (set: O(1) membership)
# Iterate a snapshot: removing stat_objects from new_stat_objects while
# iterating it directly would silently skip elements (the original bug).
for obj in list(new_stat_objects):
    name_field = obj.find('fields/field[@name="Name"]')
    if name_field is None:
        continue  # no Name field on this object — nothing to dedupe on
    value = name_field.get('value')
    if value not in seen:
        seen.add(value)
        continue
    # Duplicate Name: locate every field carrying this value.
    # NOTE(review): embedding value in the path breaks if it contains a
    # double quote — presumably names never do; confirm against the data.
    dupes = new_stat_objects.findall(
        'stat_object/fields/field[@name="Name"][@value="{}"]'.format(value))
    if len(dupes) > 1:
        # Field count of each duplicate's parent <fields> element.
        field_counts = [len(dupe.getparent()) for dupe in dupes]
        i, _ = min(enumerate(field_counts), key=operator.itemgetter(1))
        # field -> <fields> -> <stat_object>: remove the poorest duplicate.
        new_stat_objects.remove(dupes[i].getparent().getparent())
Also, are you reindexing the stat objects? (My script processes multiple .stat files and merges same-name .stat files together, so I have to reindex.)
def reindex_output(file_path):
    """Renumber the 'index' attribute of every stat_object in *file_path*.

    Parses the file, sets index="0", "1", ... in document order on each
    stat_objects/stat_object element, and rewrites the file in place
    (UTF-8, pretty-printed).

    Raises:
        etree.XMLSyntaxError: if the file is not well-formed XML.
    """
    path = os.path.abspath(file_path)
    # No try/except here: a parse failure already raises
    # etree.XMLSyntaxError with message and line/column info.  The old
    # `except ...: raise etree.XMLSyntaxError` re-raised the bare class,
    # discarding all of that context (and lxml's XMLSyntaxError cannot
    # even be constructed without arguments).
    tree = etree.parse(path, XML_PARSER)
    root = tree.getroot()
    for i, stat_object in enumerate(root.iterfind('stat_objects/stat_object')):
        stat_object.set('index', str(i))
    with open(path, 'w', encoding='utf-8') as f:
        f.write(etree.tostring(root, encoding='unicode', pretty_print=True))