aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/tools/build_pkg_index.py
diff options
context:
space:
mode:
Diffstat (limited to 'tools/build_pkg_index.py')
-rw-r--r-- tools/build_pkg_index.py 116
1 files changed, 116 insertions, 0 deletions
diff --git a/tools/build_pkg_index.py b/tools/build_pkg_index.py
index 883d06c..3bc3cfa 100644
--- a/tools/build_pkg_index.py
+++ b/tools/build_pkg_index.py
@@ -1,5 +1,9 @@
#!/usr/bin/env python
+import os
+import zipfile
+from hashlib import md5
+
"""
Build the collections package index. Usage:
@@ -13,6 +17,27 @@ xml_header = """<?xml version="1.0"?>
"""
def md5_hexdigest(file):
    """
    Calculate and return the MD5 checksum for a given file.

    ``file`` may either be a filename (a ``str``, a ``bytes`` path, or
    any ``os.PathLike`` object such as ``pathlib.Path``) or an open
    binary stream providing ``read()``.  A filename is opened in binary
    mode and closed again before returning; a stream is read from its
    current position until exhausted and is left open.

    :return: the checksum as a hexadecimal string.
    """
    # Anything that can be handed to ``open()`` as a path is treated as a
    # filename; everything else is assumed to be a readable stream.
    if isinstance(file, (str, bytes, os.PathLike)):
        with open(file, "rb") as infile:
            return _md5_hexdigest(infile)
    return _md5_hexdigest(file)


def _md5_hexdigest(fp):
    """
    Helper for ``md5_hexdigest()``: read ``fp`` in fixed-size blocks
    until ``read()`` returns an empty result, and return the MD5 hex
    digest of everything that was read.
    """
    md5_digest = md5()
    while True:
        block = fp.read(1024 * 16)  # 16k blocks
        if not block:
            break
        md5_digest.update(block)
    return md5_digest.hexdigest()
+
+
def _indent_xml(xml, prefix=""):
"""
Helper for ``build_index()``: Given an XML ``ElementTree``, modify it
@@ -95,6 +120,97 @@ def build_index(root, base_url):
return top_elt
def _find_collections(root):
    """
    Helper for ``build_index()``: Yield ``ElementTree.Element`` objects,
    one for every ``.xml`` file found anywhere below ``root``.  Each
    element is the parsed root of that file, i.e. the xml for a single
    package collection.
    """
    for folder, _children, names in os.walk(root):
        xml_names = (name for name in names if name.endswith(".xml"))
        for name in xml_names:
            tree = ElementTree.parse(os.path.join(folder, name))
            yield tree.getroot()
+
+
def _path_from(parent, child):
    """
    Return the list of path components that lead from ``parent`` down
    to ``child``.

    For example ``_path_from("a", "a/b/c")`` returns ``["b", "c"]`` and
    ``_path_from("a", "a")`` returns ``[]``.  A trailing separator on
    ``parent`` is ignored.  The comparison is purely textual: neither
    path is normalised or resolved against the filesystem.

    :raise ValueError: if ``child`` is not located below ``parent``.
    """
    # Drop a trailing separator so that "pkgs/" compares equal to "pkgs".
    head, tail = os.path.split(parent)
    if tail == "":
        parent = head

    original_child = child
    components = []
    while parent != child:
        shorter, name = os.path.split(child)
        if shorter == child:
            # ``os.path.split`` can no longer shorten the path (we hit the
            # filesystem root or the empty string) without ever reaching
            # ``parent``.  Raise explicitly rather than relying on
            # ``assert``, which is stripped under ``python -O`` and would
            # leave this loop spinning forever.
            raise ValueError(
                f"{original_child!r} is not located under {parent!r}"
            )
        components.append(name)
        child = shorter

    # Components were collected leaf-first; callers expect root-first.
    components.reverse()
    return components
+
+
def _find_packages(root):
    """
    Helper for ``build_index()``: Walk ``root`` and yield one tuple
    ``(pkg_xml, zf, subdir)`` per package found, where:
      - ``pkg_xml`` is an ``ElementTree.Element`` holding the xml for a
        package
      - ``zf`` is a ``zipfile.ZipFile`` for the package's contents.  It
        is yielded open; closing it is left to the consumer.
      - ``subdir`` is the subdirectory (relative to ``root``) where
        the package was found (e.g. 'corpora' or 'grammars').

    Every ``<name>.xml`` file is treated as a package description and
    must be accompanied by ``<name>.zip`` in the same directory.  A
    ``.zip`` without a matching ``.xml`` only triggers a warning.

    :raise ValueError: if the zip or xml file cannot be read, if the
        xml ``id`` attribute differs from the file's base name, or if
        the zipfile contains entries outside a top-level ``<id>/``
        directory.
    """
    for dirname, subdirs, files in os.walk(root):
        relpath = "/".join(_path_from(root, dirname))
        for filename in files:
            if filename.endswith(".xml"):
                xmlfilename = os.path.join(dirname, filename)
                zipfilename = xmlfilename[:-4] + ".zip"
                try:
                    zf = zipfile.ZipFile(zipfilename)
                except Exception as e:
                    raise ValueError(
                        f"Error reading file {zipfilename!r}!\n{e}"
                    ) from e

                # From here on the archive is open: make sure it gets
                # closed again if the package turns out to be invalid.
                try:
                    try:
                        pkg_xml = ElementTree.parse(xmlfilename).getroot()
                    except Exception as e:
                        raise ValueError(
                            f"Error reading file {xmlfilename!r}!\n{e}"
                        ) from e

                    # Check that the UID matches the filename
                    uid = os.path.split(xmlfilename[:-4])[1]
                    if pkg_xml.get("id") != uid:
                        raise ValueError(
                            "package identifier mismatch (%s "
                            "vs %s)" % (pkg_xml.get("id"), uid)
                        )

                    # Check that the zipfile expands to a subdir whose
                    # name matches the uid.
                    if any(
                        name != uid and not name.startswith(uid + "/")
                        for name in zf.namelist()
                    ):
                        raise ValueError(
                            "Zipfile %s.zip does not expand to a "
                            "single subdirectory %s/" % (uid, uid)
                        )
                except Exception:
                    zf.close()
                    raise

                yield pkg_xml, zf, relpath

            elif filename.endswith(".zip"):
                # Warn user in case a .xml does not exist for a .zip
                resourcename = os.path.splitext(filename)[0]
                xmlfilename = os.path.join(dirname, resourcename + ".xml")
                if not os.path.exists(xmlfilename):
                    warnings.warn(
                        f"{filename} exists, but {resourcename + '.xml'} cannot be found! "
                        f"This could mean that {resourcename} can not be downloaded.",
                        stacklevel=2,
                    )

        # Don't recurse into svn subdirectories: pruning the list in
        # place stops ``os.walk`` from descending into them.
        if ".svn" in subdirs:
            subdirs.remove(".svn")
+
+
if len(sys.argv) != 4:
print("Usage: ")
print("build_pkg_index.py <path-to-packages> <base-url> <output-file>")