summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Narayan Desai <desai@mcs.anl.gov>, 2009-11-17 20:36:04 +0000
committer: Narayan Desai <desai@mcs.anl.gov>, 2009-11-17 20:36:04 +0000
commit: e1ee914acf6f256c1bc2603177900ffccae1bc99 (patch)
tree: c21fd4f9f6b7b9979fc52519fb79431172d36434
parent: 6028fc0ee909c781633cef620d475ffedb482aca (diff)
download: bcfg2-e1ee914acf6f256c1bc2603177900ffccae1bc99.tar.gz
download: bcfg2-e1ee914acf6f256c1bc2603177900ffccae1bc99.tar.bz2
download: bcfg2-e1ee914acf6f256c1bc2603177900ffccae1bc99.zip
pkgmgr_gen: use sax to improve memory use (Patch from Gordon Messmer and Ian Dotson)
git-svn-id: https://svn.mcs.anl.gov/repos/bcfg/trunk/bcfg2@5579 ce84e21b-d406-0410-9b95-82705330c041
-rwxr-xr-x tools/pkgmgr_gen.py 94
1 files changed, 65 insertions, 29 deletions
diff --git a/tools/pkgmgr_gen.py b/tools/pkgmgr_gen.py
index c62313232..d318e46c0 100755
--- a/tools/pkgmgr_gen.py
+++ b/tools/pkgmgr_gen.py
@@ -20,7 +20,9 @@ import glob
import urllib
import gzip
import urlparse
-from elementtree.ElementTree import parse
+from lxml.etree import parse
+import xml.sax
+from xml.sax.handler import ContentHandler
def info(object, spacing=10, collapse=1):
"""Print methods and doc strings.
@@ -159,6 +161,56 @@ class pkgmgr_URLopener(urllib.FancyURLopener):
"""
print "ERROR %s: Unable to retrieve %s" % (errcode, url)
+class PrimaryParser(ContentHandler):
+ def __init__(self, packages):
+ self.inPackage = 0
+ self.inName = 0
+ self.inArch = 0
+ self.packages = packages
+
+ def startElement(self, name, attrs):
+ if name == "package":
+ self.package = {'file': None, 'name': '', 'subarch': '',
+ 'epoch': None, 'version': None, 'release': None}
+ self.inPackage = 1
+ elif self.inPackage:
+ if name == "name":
+ self.inName = 1
+ elif name == "arch":
+ self.inArch = 1
+ elif name == "version":
+ self.package['epoch'] = attrs.getValue('epoch')
+ self.package['version'] = attrs.getValue('ver')
+ self.package['release'] = attrs.getValue('rel')
+ elif name == "location":
+ self.package['file'] = attrs.getValue('href')
+
+ def endElement(self, name):
+ if name == "package":
+ self.inPackage = 0
+ # Only load RPMs with subarchitectures as calculated from the --archs option.
+ if self.package['subarch'] in subarchs or 'all' in subarchs:
+ self.packages.setdefault(self.package['name'], []).append(
+ {'filename':self.package['file'], 'name':self.package['name'],
+ 'arch':self.package['subarch'], 'epoch':self.package['epoch'],
+ 'version':self.package['version'], 'release':self.package['release']})
+ # Print '.' for each package. stdio is line buffered, so have to flush it.
+ if options.verbose:
+ sys.stdout.write('.')
+ sys.stdout.flush()
+ elif self.inPackage:
+ if name == "name":
+ self.inName = 0
+ elif name == "arch":
+ self.inArch = 0
+
+ def characters(self, content):
+ if self.inPackage:
+ if self.inName:
+ self.package['name'] += content
+ if self.inArch:
+ self.package['subarch'] += content
+
def loadRepos(repolist):
'''
repolist is a list of urls to yum repositories.
@@ -189,6 +241,9 @@ def loadRepos(repolist):
for repo in repolist:
url = urlparse.urljoin(repo, './repodata/repomd.xml')
+ if options.verbose:
+ print 'Loading repo metadata : %s' % url
+
try:
opener = pkgmgr_URLopener()
file, message = opener.retrieve(url)
@@ -203,10 +258,10 @@ def loadRepos(repolist):
repomd = tree.getroot()
for element in repomd:
- if element.tag.endswith('data') and element.attrib['type'] == 'primary':
+ if element.tag.endswith('data') and element.get('type') == 'primary':
for property in element:
if property.tag.endswith('location'):
- primaryhref = property.attrib['href']
+ primaryhref = property.get('href')
url = urlparse.urljoin(repo, './' + primaryhref)
@@ -221,36 +276,14 @@ def loadRepos(repolist):
try:
repo_file = gzip.open(file)
- tree = parse(repo_file)
except IOError:
print "ERROR: Unable to parse retrieved file."
sys.exit()
- repo = tree.getroot()
- for element in repo:
- if element.tag.endswith('package'):
- for property in element:
- if property.tag.endswith('name'):
- name = property.text
- elif property.tag.endswith('arch'):
- subarch = property.text
- elif property.tag.endswith('version'):
- version = property.get('ver')
- epoch = property.get('epoch')
- release = property.get('rel')
- elif property.tag.endswith('location'):
- file = property.get('href')
-
- # Only load RPMs with subarchitectures as calculated from the --archs option.
- if subarch in subarchs or 'all' in subarchs:
- packages.setdefault(name, []).append({'filename':file, 'name':name, \
- 'arch':subarch, 'epoch':epoch, \
- 'version':version, 'release':release})
-
- # Print '.' for each package. stdio is line buffered, so have to flush it.
- if options.verbose:
- sys.stdout.write('.')
- sys.stdout.flush()
+ parser = xml.sax.make_parser()
+ parser.setContentHandler(PrimaryParser(packages))
+ parser.parse(repo_file)
+
if options.verbose:
sys.stdout.write('\n')
repo_file.close()
@@ -294,6 +327,9 @@ def printPackage(entry, group_count):
arch_dict = {}
# Split instances of this package into subarchitectures.
for instance in entry:
+ if instance['arch'] == 'src':
+ continue
+
if instance['arch'] in subarch_dict:
subarch_dict[instance['arch']].append(instance)
else: