import re
from datetime import datetime


def parse_author_entry(self, author_entry):
    '''
    Split a raw author entry into a cleaned-up name and an optional email.

    The first email-like token found in the entry is taken as the email and
    every occurrence of it is removed from the text. Anything enclosed in
    () or [] and every non-ASCII character is then dropped from the rest.

    Return None when the entry is empty or only whitespace, otherwise a
    tuple (name, email). email is None when the entry contains no email and
    name may be an empty string when nothing is left after the clean up.
    '''
    author_entry = author_entry.strip()
    if not author_entry:
        return None

    email = None
    match = re.search(r'[\w\.-]+@[\w\.-]+', author_entry)
    if match:
        email = match.group(0)
        # remove the email from the name
        author_entry = author_entry.replace(email, '')

    # remove everything that is inside () or []
    author_entry = re.sub(r'[\(\[].*?[\)\]]', '', author_entry)
    # remove all non-ASCII characters
    author_entry = re.sub(r'[^\x00-\x7F]+', '', author_entry)
    return (author_entry.strip(), email)


def try_parsing_date(self, text):
    '''
    Parse a timestamp written as 'YYYY-MM-DD HH:MM:SS' or as 'YYYY-MM-DD'.

    Return a datetime object, or None when text is None, empty or does not
    match any of the supported formats.
    '''
    if text is None:
        return None
    text = text.strip()
    # every format needs the leading '%' on the year directive; without it
    # the date-only variant can never match
    for fmt in ('%Y-%m-%d %H:%M:%S', '%Y-%m-%d'):
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            continue
    return None
int(parse_qs(parsed_href.query)['id'][0]) # get timestamp of the last uploaded version - last_version_uploaded = None - if len(cols[2].text.strip()) > 0: - last_version_uploaded = datetime.strptime(cols[2].text.strip(), '%Y-%m-%d %H:%M:%S') - + last_version_uploaded = self.try_parsing_date(cols[2].text) + created_timestamp = self.try_parsing_date(cols[1].text) + # get authors authors = [] for entry in cols[5].contents: if isinstance(entry, bs4.Tag): - parsed_href = urlparse(entry['href']) + parsed_href = entry.text + email = None + name = None + try: + parsed_href = urlparse(entry['href']) + email = parsed_href.path + author_data = self.parse_author_entry(entry.text) + name = entry.text + if author_data: + name = author_data[0] # clean version of the name + # sometimes people type name and email wrong in MDMS and they are flipped + if not '@' in email and author_data[1]: + name = email + email = author_data[1] + except KeyError: + # sometimes Author's field is formatted with fake html tags. 
'''
This is an example how to get a list of authors who had input documents
with a specific substring in the title
'''

# meetings with the smaller number will be ignored
LAST_MEETING_NUMBER = 42

TITLE_WHITELIST = ['ISOBMFF', 'BMFF', '14496-12', 'file format', 'MP4', 'ISOBMF', 'ISO Base']

# author name -> list of relevant documents of that author, filled by main()
authors = {}


def document_is_relevant(title, filter_strings):
    '''
    Tell whether title contains at least one of filter_strings.

    The comparison is case-insensitive and ignores leading and trailing
    whitespace of the title and of every filter string.
    Return False when title is None or filter_strings is empty.
    '''
    if title is None:
        return False
    # normalise the title once instead of once per filter string
    normalized_title = title.lower().strip()
    return any(sub.lower().strip() in normalized_title for sub in filter_strings)


def main():
    '''
    Walk over all meetings starting with LAST_MEETING_NUMBER, collect the
    authors of the relevant input documents in the module-level authors
    dict and print every author name with the number of contributions.
    '''
    # imported here so that the helper above can be imported without
    # pulling in the MDMS client, which talks to the network
    from automation import mdms

    for meeting in mdms.get_meetings():
        if meeting['number'] < LAST_MEETING_NUMBER:
            continue
        print('process meegting', meeting['number'])
        for doc in mdms.get_input_documents(meeting['id']):
            if not document_is_relevant(doc['title'], TITLE_WHITELIST):
                continue
            for author in doc['authors']:
                authors.setdefault(author['name'], []).append(doc)

    # print author names and number of contributions
    for name, docs in authors.items():
        print(name, ' :', len(docs))


if __name__ == '__main__':
    main()