summaryrefslogtreecommitdiffstats
path: root/src/silfont/etutil.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/silfont/etutil.py')
-rw-r--r--src/silfont/etutil.py270
1 files changed, 270 insertions, 0 deletions
diff --git a/src/silfont/etutil.py b/src/silfont/etutil.py
new file mode 100644
index 0000000..35e5a0a
--- /dev/null
+++ b/src/silfont/etutil.py
@@ -0,0 +1,270 @@
+#!/usr/bin/env python3
+'Classes and functions for handling XML files in pysilfont scripts'
+__url__ = 'https://github.com/silnrsi/pysilfont'
+__copyright__ = 'Copyright (c) 2015 SIL International (https://www.sil.org)'
+__license__ = 'Released under the MIT License (https://opensource.org/licenses/MIT)'
+__author__ = 'David Raymond'
+
+from xml.etree import ElementTree as ET
+import silfont.core
+
+import re, os, codecs, io, collections
+
# Characters that must be escaped in element text, and their XML entity replacements.
# '&' has to map to '&amp;'; mapping it to itself would leave bare ampersands in the output.
_elementprotect = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;'}
_attribprotect = dict(_elementprotect)
_attribprotect['"'] = '&quot;'  # Copy of element protect with double quote added


class ETWriter(object):
    """ General purpose ElementTree pretty printer complete with options for attribute order
    beyond simple sorting, and which elements should use cdata

    Note there is no support for namespaces. Originally there was, and if it is needed in the future look at
    commits from 10th May 2018 or earlier. The code there would need reworking!

    Attributes whose names start with '.' are never written as XML attributes. Four of them are
    interpreted by serialize_xml as comma-separated lists:
        .pi             processing instructions written before the root element
        .doctype        DOCTYPE declaration written before the root element (single value)
        .comments       comments written before the element
        .commentsafter  comments written after the element
    """

    def __init__(self, etree, attributeOrder=None, takesCData=None,
                 indentIncr=" ", indentFirst=" ", indentML=False, inlineelem=None,
                 precision=None, floatAttribs=None, intAttribs=None):
        # Container arguments default to None rather than to a mutable literal so that
        # each writer gets its own fresh container instead of one shared by all instances.
        self.root = etree
        # Sort order for attributes - just one dict for all elements
        self.attributeOrder = {} if attributeOrder is None else attributeOrder
        # Tags whose text is written inside a CDATA section
        self.takesCData = set() if takesCData is None else takesCData
        self.indentIncr = indentIncr    # Incremental increase in indent
        self.indentFirst = indentFirst  # Indent for first level
        self.indentML = indentML        # Add indent to multi-line strings
        # For supporting in-line elements. Does not work with mix of inline and other
        # subelements in same element
        self.inlineelem = [] if inlineelem is None else inlineelem
        self.precision = precision      # Precision to use outputting numeric attribute values
        # List of float/real attributes used with precision
        self.floatAttribs = [] if floatAttribs is None else floatAttribs
        # List of attributes that are rounded to an integer on output
        self.intAttribs = [] if intAttribs is None else intAttribs

    def _protect(self, txt, base=_attribprotect):
        """Return txt with every character that is a key of base replaced by its entity."""
        pattern = "[" + re.escape("".join(base)) + "]"
        return re.sub(pattern, lambda m: base[m.group(0)], txt)

    def _format_attrib(self, name, value):
        """Return the value to write for attribute name, applying numeric rounding or escaping."""
        if self.precision is not None and name in self.floatAttribs:
            # Only values containing a decimal point are rounded; others are written unchanged
            if "." in value:
                num = round(float(value), self.precision)
                value = int(num) if num == int(num) else num
            return value
        if name in self.intAttribs:
            return int(round(float(value)))
        return self._protect(value)

    def serialize_xml(self, base=None, indent=''):
        """Create the xml and return as a string

        base is the element to serialize; when None the root element is used and the XML
        declaration (plus any '.pi' and '.doctype' values) is written first.
        indent is the whitespace prefix for this element; children are serialized
        recursively with indentFirst (below the root) or indentIncr added to it.
        """
        out = []  # Pieces of output, joined once at the end
        if base is None:
            base = self.root
            out.append('<?xml version="1.0" encoding="UTF-8"?>\n')
            if '.pi' in base.attrib:  # Processing instructions
                out.extend('<?{}?>\n'.format(pi) for pi in base.attrib['.pi'].split(","))
            if '.doctype' in base.attrib:
                out.append('<!DOCTYPE {}>\n'.format(base.attrib['.doctype']))

        tag = base.tag
        attribs = base.attrib
        inline = tag in self.inlineelem

        if '.comments' in attribs:
            out.extend('{}<!--{}-->\n'.format(indent, c) for c in attribs['.comments'].split(","))

        out.append('{}<{}'.format("" if inline else indent, tag))

        # Attributes listed in attributeOrder come first, in the order given there; the rest
        # follow sorted by name. The two-part key keeps order values and attribute names in
        # separate groups so they are never compared with each other (int vs str comparison
        # raises TypeError in Python 3).
        order = self.attributeOrder
        for k in sorted(attribs, key=lambda x: (x not in order, order.get(x, x))):
            if k.startswith('.'):
                continue
            out.append(' {}="{}"'.format(k, self._format_attrib(k, attribs[k])))

        text = base.text if base.text and base.text.strip() else None
        if len(base) or text is not None:
            out.append('>')
            if text is not None:
                if tag in self.takesCData:
                    out.append("<![CDATA[\n\t" + indent + text.replace('\n', '\n\t' + indent)
                               + "\n" + indent + "]]>")
                else:
                    if self.indentML:
                        text = text.replace('\n', '\n' + indent)
                    out.append(self._protect(text, base=_elementprotect))
            if len(base):
                if base[0].tag not in self.inlineelem:
                    out.append('\n')
                incr = self.indentFirst if base is self.root else self.indentIncr
                for child in base:
                    out.append(self.serialize_xml(base=child, indent=indent + incr))
                if base[-1].tag not in self.inlineelem:
                    out.append(indent)
            out.append('</{}>'.format(tag))
        else:
            out.append('/>')

        if base.tail and base.tail.strip():
            out.append(self._protect(base.tail, base=_elementprotect))
        if not inline:
            out.append("\n")

        if '.commentsafter' in attribs:
            out.extend('{}<!--{}-->\n'.format(indent, c) for c in attribs['.commentsafter'].split(","))

        return "".join(out)
+
+class _container(object) :
+ # Parent class for other objects
+ def __init_(self) :
+ self._contents = {}
+ # Define methods so it acts like an imutable container
+ # (changes should be made via object functions etc)
+ def __len__(self):
+ return len(self._contents)
+ def __getitem__(self, key):
+ return self._contents[key]
+ def __iter__(self):
+ return iter(self._contents)
+ def keys(self) :
+ return self._contents.keys()
+
class xmlitem(_container):
    """ The xml data item for an xml file

    When both dirn and filen are given, the file is read as UTF-8 into inxmlstr and, if parse
    is true, parsed into etree. A parse failure is reported through logger rather than raised
    from here; etree is then left as None unless the logger itself raises or exits.
    """

    def __init__(self, dirn=None, filen=None, parse=True, logger=None):
        self.logger = logger if logger else silfont.core.loggerobj()
        self._contents = {}
        self.dirn = dirn
        self.filen = filen
        self.inxmlstr = ""
        self.outxmlstr = ""
        self.etree = None
        self.type = None
        if filen and dirn:
            fulln = os.path.join(dirn, filen)
            # Context manager so the file handle is closed even if read() fails
            with io.open(fulln, "rt", encoding="utf-8") as infile:
                self.inxmlstr = infile.read()
            if parse:
                self._parse(fulln)

    def _parse(self, fulln):
        """Parse inxmlstr into etree; fulln is only used in the error message."""
        try:
            self.etree = ET.fromstring(self.inxmlstr)
            return
        except Exception:
            # Fall through and retry with the UTF-8 encoded bytes before reporting failure
            pass
        try:
            self.etree = ET.fromstring(self.inxmlstr.encode("utf-8"))
        except Exception as e:
            self.logger.log("Failed to parse xml for " + fulln, "E")
            self.logger.log(str(e), "S")

    def write_to_file(self, dirn, filen):
        """Write outxmlstr to the file filen in directory dirn, encoded as UTF-8."""
        # Context manager ensures the data is flushed and the handle closed
        with io.open(os.path.join(dirn, filen), 'w', encoding="utf-8") as outfile:
            outfile.write(self.outxmlstr)
+
class ETelement(_container):
    """Class for an etree element. Mainly used as a parent class

    For each tag in the element, ETelement[tag] returns a list of sub-elements with that tag.
    process_attributes and process_subelements set attributes on this object based on a
    supplied spec and collect any problems in self.parseerrors.
    """

    def __init__(self, element):
        self.element = element
        self._contents = {}
        self.reindex()

    def reindex(self):
        """Rebuild the tag -> [sub-elements] index from the current children of the element."""
        index = collections.defaultdict(list)
        for sub in self.element:
            index[sub.tag].append(sub)
        self._contents = index

    def remove(self, subelement):
        """Remove subelement from both the tag index and the underlying element."""
        self._contents[subelement.tag].remove(subelement)
        self.element.remove(subelement)

    def append(self, subelement):
        """Add subelement as the last child of the element."""
        self._contents[subelement.tag].append(subelement)
        self.element.append(subelement)

    def insert(self, index, subelement):
        """Insert subelement at position index among the element's children."""
        self.element.insert(index, subelement)
        # index is a position among all children, not within the per-tag list, so the
        # tag index is rebuilt rather than patched with the same number.
        self.reindex()

    def replace(self, index, subelement):
        """Replace the child at position index of the element with subelement."""
        self.element[index] = subelement
        # The replaced child may have had a different tag and index is element-level,
        # so rebuild the tag index to keep it in step with the element.
        self.reindex()

    def process_attributes(self, attrspec, others=False):
        # Process attributes based on list of attributes in the format:
        #   (element attr name, object attr name, required)
        # If attr does not exist and is not required, set to None
        # If others is True, attributes not in the list are allowed
        # Attributes should be listed in the order they should be output if writing xml out

        if getattr(self, "parseerrors", None) is None:
            self.parseerrors = []

        speclist = {spec[0]: spec for spec in attrspec}

        for (eaname, oaname, req) in speclist.values():
            setattr(self, oaname, getattrib(self.element, eaname))
            if req and getattr(self, oaname) is None:
                self.parseerrors.append("Required attribute " + eaname + " missing")

        # check for any other attributes
        for att in self.element.attrib:
            if att in speclist:
                continue
            if others:
                setattr(self, att, getattrib(self.element, att))
            else:
                self.parseerrors.append("Invalid attribute " + att)

    def process_subelements(self, subspec, offspec=False):
        # Process all subelements based on spec of expected elements
        # subspec is a list of elements, with each list in the format:
        #   (element name, attribute name, class name, required, multiple values allowed)
        # If cl is set, attribute is set to an object made with that class; otherwise just text of the element
        # If offspec is True, elements not in the spec are allowed and are stored as a list of
        # ETelement objects in an attribute named after the tag

        if getattr(self, "parseerrors", None) is None:
            self.parseerrors = []

        def make_obj(cl, element):  # Create object from element and cascade parse errors down
            if cl is None:
                return element.text
            if cl is ETelement:
                obj = cl(element)  # ETelement does not require parent object, ie self
            else:
                obj = cl(self, element)
            if getattr(obj, "parseerrors", None):
                # Try to find a name for error reporting
                if getattr(obj, "name", None) is not None:
                    name = obj.name
                elif getattr(obj, "label", None) is not None:
                    name = obj.label
                else:
                    name = ""
                self.parseerrors.append("Errors parsing " + element.tag + " element: " + name)
                for error in obj.parseerrors:
                    self.parseerrors.append(" " + error)
            return obj

        speclist = {spec[0]: spec for spec in subspec}

        # Initialise every attribute in the spec so it exists even if no element is found
        for (ename, aname, cl, req, multi) in speclist.values():
            setattr(self, aname, [] if multi else None)

        for ename in list(self):  # Process all elements
            # Looked up before branching: the off-spec branch needs it as well
            elements = self[ename]
            if not elements:
                continue  # Tag whose sub-elements have all been removed
            if ename in speclist:
                (ename, aname, cl, req, multi) = speclist[ename]
                if multi:
                    target = getattr(self, aname)
                    for elem in elements:
                        target.append(make_obj(cl, elem))
                else:
                    setattr(self, aname, make_obj(cl, elements[0]))
                    if len(elements) > 1:
                        self.parseerrors.append("Multiple " + ename + " elements not allowed")
            elif offspec:  # Elements not in spec are allowed so create list of sub-elements.
                setattr(self, ename, [ETelement(elem) for elem in elements])
            else:
                self.parseerrors.append("Invalid element: " + ename)

        for (ename, aname, cl, req, multi) in speclist.values():  # Check values exist for required elements etc
            if not req:
                continue
            val = getattr(self, aname)
            if multi and val == []:
                self.parseerrors.append("No " + ename + " elements ")
            if not multi and val is None:
                self.parseerrors.append("No " + ename + " element")
+
def makeAttribOrder(attriblist):
    """Turn a list of attrib names into an attributeOrder dict for ETWriter

    Each name maps to its position in attriblist; if a name is repeated, its last
    position is the one kept.
    """
    return {name: position for position, name in enumerate(attriblist)}
+
def getattrib(element, attrib):
    """Return the value of attribute attrib on element, or None if it is not present."""
    return element.attrib.get(attrib)