diff options
Diffstat (limited to 'tools/build_pkg_index.py')
| -rw-r--r-- | tools/build_pkg_index.py | 116 |
1 files changed, 116 insertions, 0 deletions
diff --git a/tools/build_pkg_index.py b/tools/build_pkg_index.py index 883d06c..3bc3cfa 100644 --- a/tools/build_pkg_index.py +++ b/tools/build_pkg_index.py @@ -1,5 +1,9 @@ #!/usr/bin/env python +import os +import zipfile +from hashlib import md5 + """ Build the collections package index. Usage: @@ -13,6 +17,27 @@ xml_header = """<?xml version="1.0"?> """ +def md5_hexdigest(file): + """ + Calculate and return the MD5 checksum for a given file. + ``file`` may either be a filename or an open stream. + """ + if isinstance(file, str): + with open(file, "rb") as infile: + return _md5_hexdigest(infile) + return _md5_hexdigest(file) + + +def _md5_hexdigest(fp): + md5_digest = md5() + while True: + block = fp.read(1024 * 16) # 16k blocks + if not block: + break + md5_digest.update(block) + return md5_digest.hexdigest() + + def _indent_xml(xml, prefix=""): """ Helper for ``build_index()``: Given an XML ``ElementTree``, modify it @@ -95,6 +120,97 @@ def build_index(root, base_url): return top_elt +def _find_collections(root): + """ + Helper for ``build_index()``: Yield a list of ElementTree.Element + objects, each holding the xml for a single package collection. + """ + for dirname, _subdirs, files in os.walk(root): + for filename in files: + if filename.endswith(".xml"): + xmlfile = os.path.join(dirname, filename) + yield ElementTree.parse(xmlfile).getroot() + + +def _path_from(parent, child): + if os.path.split(parent)[1] == "": + parent = os.path.split(parent)[0] + path = [] + while parent != child: + child, dirname = os.path.split(child) + path.insert(0, dirname) + assert os.path.split(child)[0] != child + return path + + +def _find_packages(root): + """ + Helper for ``build_index()``: Yield a list of tuples + ``(pkg_xml, zf, subdir)``, where: + - ``pkg_xml`` is an ``ElementTree.Element`` holding the xml for a + package + - ``zf`` is a ``zipfile.ZipFile`` for the package's contents. + - ``subdir`` is the subdirectory (relative to ``root``) where + the package was found (e.g. 'corpora' or 'grammars'). + """ + # Find all packages. + packages = [] + for dirname, subdirs, files in os.walk(root): + relpath = "/".join(_path_from(root, dirname)) + for filename in files: + if filename.endswith(".xml"): + xmlfilename = os.path.join(dirname, filename) + zipfilename = xmlfilename[:-4] + ".zip" + try: + zf = zipfile.ZipFile(zipfilename) + except Exception as e: + raise ValueError( + f"Error reading file {zipfilename!r}!\n{e}") from e + try: + pkg_xml = ElementTree.parse(xmlfilename).getroot() + except Exception as e: + raise ValueError( + f"Error reading file {xmlfilename!r}!\n{e}") from e + + # Check that the UID matches the filename + uid = os.path.split(xmlfilename[:-4])[1] + if pkg_xml.get("id") != uid: + raise ValueError( + "package identifier mismatch (%s " + "vs %s)" % (pkg_xml.get("id"), uid) + ) + + # Check that the zipfile expands to a subdir whose + # name matches the uid. + if sum( + (name != uid and not name.startswith(uid + "/")) + for name in zf.namelist() + ): + raise ValueError( + "Zipfile %s.zip does not expand to a " + "single subdirectory %s/" % (uid, uid) + ) + + yield pkg_xml, zf, relpath + + elif filename.endswith(".zip"): + # Warn user in case a .xml does not exist for a .zip + resourcename = os.path.splitext(filename)[0] + xmlfilename = os.path.join(dirname, resourcename + ".xml") + if not os.path.exists(xmlfilename): + warnings.warn( + f"{filename} exists, but {resourcename + '.xml'} cannot be found! " + f"This could mean that {resourcename} can not be downloaded.", + stacklevel=2, + ) + + # Don't recurse into svn subdirectories: + try: + subdirs.remove(".svn") + except ValueError: + pass + + if len(sys.argv) != 4: print("Usage: ") print("build_pkg_index.py <path-to-packages> <base-url> <output-file>") |
