Commit af2702e5 authored by Dimitri Podborski's avatar Dimitri Podborski
Browse files

improve parsing of authors field

Also, the dates of some old meetings could not be parsed; I fixed that as well.
Add an example on how to find a list of people who contributed to
specific standard (based on the title of the input doc)
parent 1acb527d
...@@ -5,10 +5,7 @@ import requests ...@@ -5,10 +5,7 @@ import requests
import bs4 import bs4
from datetime import datetime from datetime import datetime
from enum import Enum, unique from enum import Enum, unique
import re
# "curl -s -X POST -d 'id=" + docID + "&id_meeting=' -u $MPEG_LOGIN:$MPEG_PWD https://dms.mpeg.expert/doc_end_user/current_document.php"
# "curl -s -X POST -d 'search_title=&search_number=" + contrNr + "&search_category=&search_author=&search_id_group=1&search_sub_group=1&id_meeting=&submit=Search&meeting=' -u $MPEG_LOGIN:$MPEG_PWD https://dms.mpeg.expert/doc_end_user/searchAcross.php"
BASE_URL = 'https://dms.mpeg.expert/doc_end_user/' BASE_URL = 'https://dms.mpeg.expert/doc_end_user/'
MEETINGS_URL = urljoin(BASE_URL, 'all_meeting.php') MEETINGS_URL = urljoin(BASE_URL, 'all_meeting.php')
...@@ -60,6 +57,37 @@ class MDMSParser: ...@@ -60,6 +57,37 @@ class MDMSParser:
return [] return []
return meetings return meetings
def parse_author_entry(self, author_entry):
    '''
    Extract an email address (if any) from a raw author string.

    Returns a tuple ('name', 'email') where email may be None when no
    address was found, or None when the entry is blank.
    '''
    text = author_entry.strip()
    if not text:
        return None

    # Pull out the first thing that looks like an email address and
    # drop every occurrence of it from the name part.
    found = re.search(r'[\w\.-]+@[\w\.-]+', text)
    email = found.group(0) if found else None
    if email is not None:
        text = text.replace(email, '')

    # Strip parenthesised/bracketed annotations, then any non-ASCII runs.
    for pattern in (r'[\(\[].*?[\)\]]', r'[^\x00-\x7F]+'):
        text = re.sub(pattern, '', text)

    return (text.strip(), email)
def try_parsing_date(self, text):
    '''
    Parse *text* as an MDMS timestamp.

    Tries the full date-time form first, then a bare date (older
    meetings only list a day). Returns a datetime on success, or
    None when neither format matches.
    '''
    text = text.strip()  # hoisted: strip once, not per format
    # BUGFIX: second format was 'Y-%m-%d' (missing '%'), so date-only
    # values could never be parsed and always fell through to None.
    for fmt in ('%Y-%m-%d %H:%M:%S', '%Y-%m-%d'):
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            pass
    return None
def parse_input_docs(self, html): def parse_input_docs(self, html):
docs = [] docs = []
soup = bs4.BeautifulSoup(html, features = 'lxml') soup = bs4.BeautifulSoup(html, features = 'lxml')
...@@ -93,26 +121,45 @@ class MDMSParser: ...@@ -93,26 +121,45 @@ class MDMSParser:
mdms_id = int(parse_qs(parsed_href.query)['id'][0]) mdms_id = int(parse_qs(parsed_href.query)['id'][0])
# get timestamp of the last uploaded version # get timestamp of the last uploaded version
last_version_uploaded = None last_version_uploaded = self.try_parsing_date(cols[2].text)
if len(cols[2].text.strip()) > 0: created_timestamp = self.try_parsing_date(cols[1].text)
last_version_uploaded = datetime.strptime(cols[2].text.strip(), '%Y-%m-%d %H:%M:%S')
# get authors # get authors
authors = [] authors = []
for entry in cols[5].contents: for entry in cols[5].contents:
if isinstance(entry, bs4.Tag): if isinstance(entry, bs4.Tag):
parsed_href = urlparse(entry['href']) parsed_href = entry.text
email = None
name = None
try:
parsed_href = urlparse(entry['href'])
email = parsed_href.path
author_data = self.parse_author_entry(entry.text)
name = entry.text
if author_data:
name = author_data[0] # clean version of the name
# sometimes people type name and email wrong in MDMS and they are flipped
if not '@' in email and author_data[1]:
name = email
email = author_data[1]
except KeyError:
# sometimes Author's field is formatted with fake html tags.
print('Bad HTML format in Authors field: ', entry)
name = entry.text
pass
authors.append({ authors.append({
'name': entry.text, 'name': name,
'email': parsed_href.path 'email': email
}) })
else: else:
name = entry.string.replace(',', '').strip() for author_entry in entry.string.replace(' and ', ',').split(','):
if len(name) > 0: author_data = self.parse_author_entry(author_entry)
authors.append({ if author_data:
'name': entry.string.replace(',', '').strip(), authors.append({
'email': '' 'name': author_data[0],
}) 'email': author_data[1]
})
# get latest document link (if available) # get latest document link (if available)
latest_url = None latest_url = None
...@@ -123,7 +170,7 @@ class MDMSParser: ...@@ -123,7 +170,7 @@ class MDMSParser:
docs.append({ docs.append({
'mdms_id': mdms_id, 'mdms_id': mdms_id,
'document': cols[0].text, 'document': cols[0].text,
'created': datetime.strptime(cols[1].text, '%Y-%m-%d %H:%M:%S'), 'created': created_timestamp,
'last_version_uploaded': last_version_uploaded, 'last_version_uploaded': last_version_uploaded,
'sub_group_text': cols[3].text, 'sub_group_text': cols[3].text,
'title': cols[4].text.strip(), 'title': cols[4].text.strip(),
......
from automation import mdms
import re
'''
This is an example how to get a list of authors who had input documents
with a specific substring in the title
'''
# meetings with the smaller number will be ignored
LAST_MEETING_NUMBER = 42
TITLE_WHITELIST= ['ISOBMFF', 'BMFF', '14496-12', 'file format', 'MP4', 'ISOBMF', 'ISO Base']
authors = {}
def document_is_relevant(title, filter_strings):
    '''Return True when any filter substring occurs in the title (case-insensitive).'''
    haystack = title.lower().strip()
    for needle in filter_strings:
        if needle.lower().strip() in haystack:
            return True
    return False
# Walk every meeting from LAST_MEETING_NUMBER onwards and collect, per
# author name, the list of relevant input documents they contributed.
meetings = mdms.get_meetings()
for meeting in meetings:
    if meeting['number'] < LAST_MEETING_NUMBER:
        continue
    # BUGFIX: progress message said 'meegting'
    print('process meeting', meeting['number'])
    input_docs = mdms.get_input_documents(meeting['id'])
    for doc in input_docs:
        if not document_is_relevant(doc['title'], TITLE_WHITELIST):
            continue
        for author in doc['authors']:
            # setdefault replaces the manual "in dict" grouping branch
            authors.setdefault(author['name'], []).append(doc)

# print author names and number of contributions
for author in authors:
    print(author, ' :', len(authors[author]))
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment