Commit f736ec56 authored by Dimitri Podborski's avatar Dimitri Podborski 😂
Browse files

Merge branch 'ff-victims' into 'master'

Ff victims

See merge request !3
parents 1acb527d f0dada7b
......@@ -5,10 +5,7 @@ import requests
import bs4
from datetime import datetime
from enum import Enum, unique
# "curl -s -X POST -d 'id=" + docID + "&id_meeting=' -u $MPEG_LOGIN:$MPEG_PWD"
# "curl -s -X POST -d 'search_title=&search_number=" + contrNr + "&search_category=&search_author=&search_id_group=1&search_sub_group=1&id_meeting=&submit=Search&meeting=' -u $MPEG_LOGIN:$MPEG_PWD"
import re
MEETINGS_URL = urljoin(BASE_URL, 'all_meeting.php')
......@@ -60,6 +57,37 @@ class MDMSParser:
return []
return meetings
def parse_author_entry(self, author_entry):
    """
    Search the entry string for an email, remove it from the name and clean up.

    Returns a tuple ('name', 'email'); 'email' is None when no address was
    found in the entry. Returns None when the entry is empty or contains
    only whitespace.
    """
    author_entry = author_entry.strip()
    if not author_entry:
        return None
    email = None
    match = re.search(r'[\w\.-]+@[\w\.-]+', author_entry)
    if match:  # email found
        email = match.group(0)
        # remove email from the name
        author_entry = author_entry.replace(email, '')
    # remove everything that is inside () or []
    author_entry = re.sub(r'[\(\[].*?[\)\]]', '', author_entry)
    # remove all non ASCII characters
    author_entry = re.sub(r'[^\x00-\x7F]+', '', author_entry)
    author_entry = author_entry.strip()
    return (author_entry, email)
def try_parsing_date(self, text):
    """
    Try parsing the timestamp, if not possible return None.

    Two layouts are attempted in order: a full 'YYYY-MM-DD HH:MM:SS'
    timestamp and a date-only 'YYYY-MM-DD' value. Surrounding whitespace
    is ignored.
    """
    if text is None:
        return None
    candidate = text.strip()
    for fmt in ('%Y-%m-%d %H:%M:%S', '%Y-%m-%d'):
        try:
            return datetime.strptime(candidate, fmt)
        except ValueError:
            # this layout did not match, fall through to the next one
            continue
    return None
def parse_input_docs(self, html):
docs = []
soup = bs4.BeautifulSoup(html, features = 'lxml')
......@@ -93,26 +121,45 @@ class MDMSParser:
mdms_id = int(parse_qs(parsed_href.query)['id'][0])
# get timestamp of the last uploaded version
last_version_uploaded = None
if len(cols[2].text.strip()) > 0:
last_version_uploaded = datetime.strptime(cols[2].text.strip(), '%Y-%m-%d %H:%M:%S')
last_version_uploaded = self.try_parsing_date(cols[2].text)
created_timestamp = self.try_parsing_date(cols[1].text)
# get authors
authors = []
for entry in cols[5].contents:
if isinstance(entry, bs4.Tag):
parsed_href = urlparse(entry['href'])
parsed_href = entry.text
email = None
name = None
parsed_href = urlparse(entry['href'])
email = parsed_href.path
author_data = self.parse_author_entry(entry.text)
name = entry.text
if author_data:
name = author_data[0] # clean version of the name
# sometimes people type name and email wrong in MDMS and they are flipped
if not '@' in email and author_data[1]:
name = email
email = author_data[1]
except KeyError:
# sometimes Author's field is formatted with fake html tags.
print('Bad HTML format in Authors field: ', entry)
name = entry.text
'name': entry.text,
'email': parsed_href.path
'name': name,
'email': email
name = entry.string.replace(',', '').strip()
if len(name) > 0:
'name': entry.string.replace(',', '').strip(),
'email': ''
for author_entry in entry.string.replace(' and ', ',').split(','):
author_data = self.parse_author_entry(author_entry)
if author_data:
'name': author_data[0],
'email': author_data[1]
# get latest document link (if available)
latest_url = None
......@@ -123,7 +170,7 @@ class MDMSParser:
'mdms_id': mdms_id,
'document': cols[0].text,
'created': datetime.strptime(cols[1].text, '%Y-%m-%d %H:%M:%S'),
'created': created_timestamp,
'last_version_uploaded': last_version_uploaded,
'sub_group_text': cols[3].text,
'title': cols[4].text.strip(),
from automation import mdms
This is an example of how to get a list of authors who had input documents
with a specific substring in the title.
# meetings with the smaller number will be ignored
TITLE_WHITELIST= ['ISOBMFF', 'BMFF', '14496-12', 'file format', 'MP4', 'ISOBMF', 'ISO Base']
authors = {}
def document_is_relevant(title, filter_strings):
    """
    Return True if the title contains at least one of the filter strings.

    The comparison is case-insensitive and ignores whitespace around each
    filter string. Blank filter strings are skipped, because an empty
    substring would otherwise match every title.
    """
    # normalize the title once instead of once per filter string
    haystack = title.lower()
    needles = (sub.lower().strip() for sub in filter_strings)
    return any(needle in haystack for needle in needles if needle)
meetings = mdms.get_meetings()
for meeting in meetings:
if meeting['number'] < LAST_MEETING_NUMBER:
print('process meegting', meeting['number'])
input_docs = mdms.get_input_documents(meeting['id'])
for doc in input_docs:
if not document_is_relevant(doc['title'], TITLE_WHITELIST):
for author in doc['authors']:
if author['name'] in authors:
authors[author['name']] = [doc]
# print author names and number of contributions
for author in authors:
print(author, ' :', len(authors[author]))
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment