From: sebb@apache.org
To: commits@community.apache.org
Subject: svn commit: r1690547 - in /comdev/projects.apache.org/scripts/import: parsecommittees.py parseprojects.py
Date: Sun, 12 Jul 2015 22:58:51 -0000
Message-Id: <20150712225851.69F60AC0113@hades.apache.org>

Author: sebb
Date: Sun Jul 12 22:58:51 2015
New Revision: 1690547

URL: http://svn.apache.org/r1690547
Log:
EOL

Modified:
    comdev/projects.apache.org/scripts/import/parsecommittees.py   (contents, props changed)
    comdev/projects.apache.org/scripts/import/parseprojects.py   (contents, props changed)

Modified: comdev/projects.apache.org/scripts/import/parsecommittees.py
URL: http://svn.apache.org/viewvc/comdev/projects.apache.org/scripts/import/parsecommittees.py?rev=1690547&r1=1690546&r2=1690547&view=diff
==============================================================================
--- comdev/projects.apache.org/scripts/import/parsecommittees.py (original)
+++ comdev/projects.apache.org/scripts/import/parsecommittees.py Sun Jul 12 22:58:51 2015
@@ -1,342 +1,342 @@
import re
import json
import sys
import io
import os
import urllib.request
import xml.etree.ElementTree as ET
import xml.dom.minidom as minidom
import datetime

"""
Reads:
../../site/json/foundation/people.json
../../data/committees.xml
../../data/board/committee-info.txt
../../site/json/foundation/chairs.json (cross-check)

Updates:
../../site/json/foundation/committees.json
../../site/json/foundation/committees-retired.json

Writes:
../../site/json/foundation/pmcs.json
"""

# Committee names from committee-info.txt that do not match committees-evolution.json
renamesCommittee2Json = {
    'Apache APR': 'Apache Portable Runtime',
    'Apache Perl': 'Apache mod_perl'
}
# Committee names from http://www.apache.org/foundation/ that do not match committees-evolution.json
renamesChairs2Json = {
    'Apache Logging Services': 'Apache Logging',
    'Apache Perl': 'Apache mod_perl'
}
# committee ids not matching committee name in lowercase
committeeIds = {
    'Community Development': 'comdev',
    'HTTP Server': 'httpd',
    'Lucene.Net': 'lucenenet',
    'Open Climate Workbench': 'climate'
}
# LDAP group ids not matching committee id
group_ids = {
    'webservices': 'ws'
}
# homepages not matching http://{group}.apache.org/
homepages = {
    'comdev': 'http://community.apache.org/',
    'httpcomponents': 'http://hc.apache.org/',
    'whimsy': 'http://whimsical.apache.org'
}
# short descriptions for non-classical committees that are not listed in http://www.apache.org/#projects-list
shortdescs = {
    'attic': 'A home for dormant projects',
    'comdev': 'Resources to help people become involved with Apache projects',
    'incubator': "Entry path into The Apache Software Foundation (ASF) for projects and codebases wishing to become part of the Foundation's efforts",
    'labs': 'A place for innovation where committers of the foundation can experiment with new ideas'
}

with open("../../site/json/foundation/people.json", "r") as f:
    people = json.loads(f.read())
    f.close()

def handleChild(el):
    retval = None
    hasKids = False
    for child in list(el):
        hasKids = True
    attribs = {}
    for key in el.attrib:
        xkey = re.sub(r"\{.+\}", "", key)
        attribs[xkey] = el.attrib[key]
    tag = re.sub(r"\{.+\}", "", el.tag)
    value = attribs['resource'] if 'resource' in attribs else el.text
    if not hasKids:
        retval = value
    else:
        retval = {}
        for child in list(el):
            k, v = handleChild(child)
            retval[k] = v
    return tag, retval

# get PMC Data from /data/committees.xml
print("reading PMC Data (/data/committees.xml)")
pmcs = {}
pmcDataUrls = {} # id -> url
with open("../../data/committees.xml", "r") as f:
    xmldoc = minidom.parseString(f.read())
    f.close()
itemlist = xmldoc.getElementsByTagName('location')
for s in itemlist:
    url = s.childNodes[0].data
    try:
        if url.startswith('http'):
            print(url)
            rdf = urllib.request.urlopen(url).read()
        else:
            rdf = open("../../data/%s" % url, 'r').read()
            url = "https://svn.apache.org/repos/asf/comdev/projects.apache.org/data/%s" % url
        rdfxml = ET.fromstring(rdf)
        data = rdfxml[0]
        committeeId = data.attrib['{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about']
        pmcDataUrls[committeeId] = url

        # transform PMC data RDF to json
        pmcjson = {
            'rdf': url
        }
        pmcname = None
        for el in data:
            k, v = handleChild(el)
            if k in pmcjson:
                # merge multiple values
                if type(pmcjson[k]) is str:
                    pmcjson[k] = "%s, %s" % (pmcjson[k], v)
                else:
                    for xk in v:
                        pmcjson[k][xk] = v[xk]
            else:
                pmcjson[k] = v

        pmcs[committeeId] = pmcjson

        # copy PMC RDF data to /doap/{committeeId}/pmc-doap.rdf
        if type(rdf) is str:
            mode = "w"
        else:
            mode = "wb"
        with open("../../site/doap/%s/pmc-doap.rdf" % committeeId, mode) as f:
            f.write(rdf)
            f.close()

    except Exception as err:
        print(err)

with open("../../data/board/committee-info.txt", "rb") as f:
    data = f.read().decode('utf-8')
    f.close()

# extract reporting cycles information
cycles = { 'Incubator': 0 } # Incubator reports each month
current = 1
reports = data[data.index("January, April, July, October"):data.index('Next month')]
buf = io.StringIO(reports)
newCycle = True
for p in buf.readlines():
    if p.startswith(' '):
        cycles[p.strip()] = current
        newCycle = False
    elif len(p.strip()) == 0:
        if not newCycle:
            newCycle = True
            current += 1
# extract committees composition
data = data[data.index('Hint: '):]
data = data[data.index('* '):]
buf = io.StringIO(data)
curCommittee = ''
committees = {}
c = {}
newCommittee = True
for l in buf.readlines():
    if l.startswith('* '):
        curCommittee = l[2:l.index(' (')]
        newCommittee = True
        c['members'] = {}
    elif len(l.strip()) == 0:
        if newCommittee:
            committees[curCommittee] = c
            c = {}
            newCommittee = False
    elif not l.startswith('==='):
        m = re.search(r"(.+?)\s+<([^@]+)@apache.org", l.strip())
        if not m:
            print("unexpected line format: %s" % l.strip())
        fullname = m.group(1)
        uid = m.group(2)
        isChair = fullname.endswith('(chair)')
        if isChair:
            fullname = fullname[0:fullname.index('(')].strip()
            c['chair'] = uid
        c['members'][uid] = fullname

# This only appears to be used for checking links
www = urllib.request.urlopen("http://www.apache.org/").read().decode('utf-8')

committeeCount = 0
committeesList = []
committeesMap = {}
addedCommittees = []
c = {}

for pmc in re.findall(r"\* .+?\s+\(est\. [0-9/]+[^\r\n]+", data):

    #print(pmc)
    m = re.search(r"\* (.+?)\s+\(est. ([0-9]+)/([0-9]+)", pmc)
    if m:
        committeeShortName = m.group(1)
        month = m.group(2)
        year = m.group(3)
        if not re.search(r"Committee", pmc):
            if committeeShortName in committeeIds:
                committeeId = committeeIds[committeeShortName]
            else:
                committeeId = committeeShortName.lower().replace(' ', '').replace('.', '')
            # Classical committee
            committeeName = "Apache %s" % committeeShortName
            if committeeName in renamesCommittee2Json:
                committeeName = renamesCommittee2Json[committeeName]
            #print(committeeShortName)
            committeeCount += 1

            # add committee to committees
            committee = {}
            committee['id'] = committeeId
            if committeeId in group_ids:
                group = group_ids[committeeId]
            else:
                group = committeeId
            committee['group'] = group
            committee['name'] = committeeName
            committee['established'] = "%s-%s" % (year, month)
            if group in homepages:
                homepage = homepages[group]
            else:
                homepage = 'http://%s.apache.org/' % group
            committee['homepage'] = homepage
            # committee committers and PMC members
            pmcgroup = "%s-pmc" % group
            committers = [] # [ 'login' ]
            pmc = [] # [ 'login' ]
            for login in people:
                p = people[login]
                if p['groups']:
                    if group in p['groups']:
                        committers.append(login)
                    if pmcgroup in p['groups']:
                        pmc.append(login)
                else:
                    print("user %s has no groups" % login)
            committers.sort()
            pmc.sort()
            # don't store committers and PMC members arrays in committee: it's easy to get from groups.json
            #committee['pmcs'] = pmc
            #committee['committers'] = committers
            if len(pmc) == 0:
                print('WARN: %s (%s established in %s) has no PMC members LDAP group (id=%s)' % (committeeId, committeeName, committee['established'], pmcgroup))
            if committeeShortName in committees:
                committee['chair'] = committees[committeeShortName]['chair']
            if committeeShortName in cycles:
                committee['reporting'] = cycles[committeeShortName]
            else:
                print('WARN: %s not found in reporting cycles' % committeeShortName)

            # short description: either hardcoded, or scraped from the title attribute
            # of the matching link in the www.apache.org projects list
            link = '<a href="%s" title="' % homepage
            if committeeId in shortdescs:
                committee['shortdesc'] = shortdescs[committeeId]
            elif link in www:
                stitle = www[www.index(link) + len(link):]
                shortdesc = stitle[0:stitle.index('">')]
                committee['shortdesc'] = shortdesc
            else:
                print("WARN: %s (%s) missing from http://www.apache.org/#projects-list" % (committeeShortName, homepage))
            # TODO committee['description'] (or charter) not in committee-info.txt
            # TODO committee['retired'] not in committee-info.txt
            if committeeId in pmcDataUrls:
                committee['rdf'] = pmcDataUrls[committeeId]
            else:
                print("WARN: %s (%s) missing from committees.xml" % (committeeShortName, committeeId))
            committeesList.append(committee)
            committeesMap[committeeId] = committee

            # generate TLP PMC DOAP file at http://projects-new.apache.org/doap/{committeeId}/pmc.rdf
            doap = ET.Element('rdf:RDF', attrib = { 'xml:lang': 'en',
                'xmlns': 'http://usefulinc.com/ns/doap#',
                'xmlns:rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
                'xmlns:asfext': 'http://projects.apache.org/ns/asfext#',
                'xmlns:foaf': 'http://xmlns.com/foaf/0.1/'
            })
            doap_pmc = ET.SubElement(doap, 'asfext:pmc')
            ET.SubElement(doap_pmc, 'asfext:name').text = committeeName
            ET.SubElement(doap_pmc, 'homepage', attrib = { 'rdf:resource': homepage })
            doap_chair = ET.SubElement(doap_pmc, 'asfext:chair')
            doap_chair_person = ET.SubElement(doap_chair, 'foaf:Person')
            ET.SubElement(doap_chair_person, 'foaf:nick').text = committee['chair']
            ET.SubElement(doap_chair_person, 'foaf:name').text = people[committee['chair']]['name']
            directory = "../../site/doap/%s" % committeeId
            if not os.path.exists(directory):
                os.makedirs(directory)
            with open("%s/pmc.rdf" % directory, "w") as f:
                f.write(minidom.parseString(ET.tostring(doap, encoding="utf-8")).toprettyxml(indent="\t"))
                f.close()

        else:
            # Special Committee (Officer's, President's or Board)
            print("INFO: %s ignored %s" % (committeeShortName, pmc[pmc.rfind('('):]))
        c[committeeName] = True

print("found %s new committees from %s committees in committee-info.txt" % (len(addedCommittees), committeeCount))
addedCommittees.sort()
for added in addedCommittees:
    print("- %s" % added)

# detect retired committees to add to committees-retired.json
with open("../../site/json/foundation/committees.json", "r") as f:
    committeesPrevious = json.loads(f.read())
    f.close()
with open("../../site/json/foundation/committees-retired.json", "r") as f:
    committeesRetired = json.loads(f.read())
    f.close()
for previous in committeesPrevious:
    if not previous['id'] in committeesMap:
        print("found retired committee: %s" % previous['name'])
        previous['retired'] = datetime.date.today().strftime('%Y-%m')
        # remove data that is not useful in a retired committee
        previous.pop('chair', None)
        previous.pop('group', None)
        previous.pop('rdf', None)
        previous.pop('reporting', None)
        committeesRetired.append(previous)

with open("../../site/json/foundation/committees.json", "w") as f:
    f.write(json.dumps(committeesList, sort_keys=True, indent=0))
    f.close()

with open("../../site/json/foundation/committees-retired.json", "w") as f:
    f.write(json.dumps(committeesRetired, sort_keys=True, indent=0))
    f.close()

with open("../../site/json/foundation/pmcs.json", "w") as f:
    f.write(json.dumps(pmcs, sort_keys=True, indent=0))
    f.close()

# compare with chairs, for consistency checking
chairs = json.load(open("../../site/json/foundation/chairs.json"))
for chair in chairs:
    if chair in renamesChairs2Json:
        chair = renamesChairs2Json[chair]
    if not chair in c:
        print("WARN: %s is in http://www.apache.org/foundation/ but not in committee-info.txt: typo somewhere or retirement in progress?" % chair)
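Note: the handleChild helper above (shared verbatim with parseprojects.py below) flattens one RDF/DOAP element into a (tag, value) pair: namespace prefixes are stripped from names, an rdf:resource attribute wins over element text, leaf elements become strings, and nested elements become dicts. The following is a minimal standalone sketch of that behaviour; the DOAP fragment is made up purely for illustration:

import re
import xml.etree.ElementTree as ET

def handleChild(el):
    # same logic as the helper in the script above
    retval = None
    hasKids = False
    for child in list(el):
        hasKids = True
    attribs = {}
    for key in el.attrib:
        # strip the "{namespace}" prefix ElementTree puts on names
        xkey = re.sub(r"\{.+\}", "", key)
        attribs[xkey] = el.attrib[key]
    tag = re.sub(r"\{.+\}", "", el.tag)
    # prefer an rdf:resource attribute over element text
    value = attribs['resource'] if 'resource' in attribs else el.text
    if not hasKids:
        retval = value
    else:
        retval = {}
        for child in list(el):
            k, v = handleChild(child)
            retval[k] = v
    return tag, retval

# made-up DOAP fragment, for illustration only
sample = """<Project xmlns="http://usefulinc.com/ns/doap#"
         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
  <name>Apache Example</name>
  <homepage rdf:resource="http://example.apache.org/"/>
  <release><Version><name>example-1.0</name></Version></release>
</Project>"""

pjson = {}
for el in ET.fromstring(sample):
    k, v = handleChild(el)
    pjson[k] = v
print(pjson)
# {'name': 'Apache Example', 'homepage': 'http://example.apache.org/',
#  'release': {'Version': {'name': 'example-1.0'}}}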
Propchange: comdev/projects.apache.org/scripts/import/parsecommittees.py
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: comdev/projects.apache.org/scripts/import/parseprojects.py
URL: http://svn.apache.org/viewvc/comdev/projects.apache.org/scripts/import/parseprojects.py?rev=1690547&r1=1690546&r2=1690547&view=diff
==============================================================================
--- comdev/projects.apache.org/scripts/import/parseprojects.py (original)
+++ comdev/projects.apache.org/scripts/import/parseprojects.py Sun Jul 12 22:58:51 2015
@@ -1,189 +1,189 @@
from xml.dom import minidom
import xml.etree.ElementTree as ET
import re, urllib.request
import json
import os
import traceback

"""

Reads:
../../data/projects.xml
parseprojects-failures.xml (if exists)
../../site/json/foundation/committees-retired.json

Writes:
../../site/json/foundation/projects.json
../../site/json/projects/%s.json
../../site/doap/%s/%s.rdf
parseprojects-failures.xml (if failures occurred)

"""

projectsList = "../../data/projects.xml"
save = True
if os.path.exists("parseprojects-failures.xml"):
    projectsList = "parseprojects-failures.xml"
    save = False
with open(projectsList, "r") as f:
    data = f.read()
    f.close()
xmldoc = minidom.parseString(data)
itemlist = xmldoc.getElementsByTagName('location')

siteMap = {
    'hc': 'httpcomponents',
    'ws': 'webservices'
}

def site2committee(siteId):
    if siteId in siteMap:
        return siteMap[siteId]
    return siteId

with open("../../site/json/foundation/committees-retired.json", "r") as f:
    committeesRetired = json.loads(f.read())
    f.close()
retired = []
for r in committeesRetired:
    retired.append(r['id'])

projects = {}
failures = []

def handleChild(el):
    retval = None
    hasKids = False
    for child in list(el):
        hasKids = True
    attribs = {}
    for key in el.attrib:
        xkey = re.sub(r"\{.+\}", "", key)
        attribs[xkey] = el.attrib[key]
    tag = re.sub(r"\{.+\}", "", el.tag)
    value = attribs['resource'] if 'resource' in attribs else el.text
    if not hasKids:
        retval = value
    else:
        retval = {}
        for child in list(el):
            k, v = handleChild(child)
            retval[k] = v
            if k == "location":
                retval = v
                break
    return tag, retval

for s in itemlist:
    url = s.childNodes[0].data
    try:
        rdf = urllib.request.urlopen(url).read()
        rdfxml = ET.fromstring(rdf)
        project = rdfxml[0]
        pjson = {
            'doap': url
        }
        prname = None
        committeeId = None
        projectJsonFilename = None
        for el in project:
            k, v = handleChild(el)
            if not save:
                print("+ %s" % k)
            if k in pjson and not k in ['name', 'homepage']:
                if type(pjson[k]) is str:
                    pjson[k] = "%s, %s" % (pjson[k], v)
                else:
                    for xk in v:
                        pjson[k].append(v[xk])
            else:
                if k not in ['release', 'implements', 'repository', 'developer', 'maintainer', 'member', 'helper']:
                    pjson[k] = v
                else:
                    pjson[k] = []
                    for xk in v:
                        pjson[k].append(v[xk])

        if pjson['homepage']:
            homepage = pjson['homepage']
            m = re.match(r"https?://([^.]+)\.", homepage, re.IGNORECASE)
            if m:
                siteId = site2committee(m.group(1))
                nn = re.sub(r"http.+\.apache\.org/?", "", homepage)
                if (nn == ""):
                    projectJsonFilename = siteId
                else:
                    nn = nn.replace('/', ' ').strip().split().pop().replace('-project', '')
                    if nn.startswith("%s-" % siteId):
                        projectJsonFilename = nn
                    else:
                        projectJsonFilename = "%s-%s" % (siteId, nn)
        else:
            print("WARN: no homepage defined in %s, pmc = %s" % (url, pjson['pmc']))

        if pjson['pmc'].startswith('http://attic.apache.org'):
            committeeId = 'attic'
        elif '.incubator.' in homepage:
            committeeId = 'incubator'
        else:
            committeeId = siteId
        if committeeId in retired:
            print("WARN: project from a retired committee but PMC not changed to Attic in %s" % url)
            committeeId = 'attic'
        pjson['pmc'] = committeeId

        # replace category url with id, by removing http://projects.apache.org/category/
        if 'category' in pjson:
            pjson['category'] = pjson['category'].replace("http://projects.apache.org/category/", "")
            if committeeId == 'attic' and not 'retired' in pjson['category']:
                print("WARN: project in Attic but not in 'retired' category: %s" % url)
                pjson['category'] = "%s, retired" % pjson['category']
        elif committeeId == 'attic':
            print("WARN: project in Attic but not in 'retired' category: %s" % url)
            pjson['category'] = "retired"
        if projectJsonFilename:
            #add = {}
            #for k in pjson:
            #    if pjson[k] != None and type(pjson[k]) is not str:
            #        for e in pjson[k]:
            #            add[e] = pjson[k][e]
            #        pjson[k] = None

            projects[projectJsonFilename] = pjson
            #for e in add:
            #    pjson[e] = add[e]
            print("Writing projects/%s.json..." % projectJsonFilename)
            with open("../../site/json/projects/%s.json" % projectJsonFilename, "w") as f:
                f.write(json.dumps(pjson, sort_keys=True, indent=0))
                f.close()
            # copy project DOAP to /doap/{committeeId}/{projectJsonFilename}.rdf
            with open("../../site/doap/%s/%s.rdf" % (committeeId, projectJsonFilename), "wb") as f:
                f.write(rdf)
                f.close()
        else:
            print("WARN: project ignored since unable to extract project json filename from %s" % url)
    except Exception as err:
        print("Error when reading %s's doap file %s:" % (prname, url))
        print("-"*60)
        traceback.print_exc()
        print("-"*60)
        failures.append(url)
        with open(url.split('/')[-1], "wb") as f:
            f.write(rdf)
            f.close()

if save:
    print("Writing foundation/projects.json...")
    with open("../../site/json/foundation/projects.json", "w") as f:
        f.write(json.dumps(projects, sort_keys=True, indent=0))
        f.close()

if len(failures) > 0:
    # save failed URLs as <location> elements so the file can be re-read
    # on the next run; the root tag name itself is never inspected
    with open("parseprojects-failures.xml", "w") as f:
        f.write("<list>\n")
        for fail in failures:
            f.write("<location>%s</location>\n" % fail)
        f.write("</list>\n")
        f.close()

print("Done!")
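Note: the homepage-to-filename derivation in the loop above is the trickiest part of parseprojects.py. The sketch below repeats the same steps behind a hypothetical project_json_filename wrapper (not part of the script), fed with made-up homepages:

import re

siteMap = {'hc': 'httpcomponents', 'ws': 'webservices'}

def site2committee(siteId):
    return siteMap.get(siteId, siteId)

def project_json_filename(homepage):
    # hypothetical wrapper: same derivation as in parseprojects.py above
    m = re.match(r"https?://([^.]+)\.", homepage, re.IGNORECASE)
    siteId = site2committee(m.group(1))
    nn = re.sub(r"http.+\.apache\.org/?", "", homepage)
    if nn == "":
        return siteId                     # homepage is the bare TLP site
    nn = nn.replace('/', ' ').strip().split().pop().replace('-project', '')
    if nn.startswith("%s-" % siteId):
        return nn                         # path already carries the prefix
    return "%s-%s" % (siteId, nn)

# made-up homepages, for illustration
print(project_json_filename("http://httpd.apache.org/"))          # httpd
print(project_json_filename("http://hc.apache.org/httpclient/"))  # httpcomponents-httpclient
print(project_json_filename("http://ws.apache.org/axis2/"))       # webservices-axis2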
Propchange: comdev/projects.apache.org/scripts/import/parseprojects.py
------------------------------------------------------------------------------
    svn:eol-style = native