implement get_document_details

Allows you to fetch more information about a document, e.g. its abstract.
......@@ -181,6 +181,102 @@ class MDMSParser:
print('Error: Could not parse input documents data. Did MDMS layout change?')
return []
return docs
def parse_document_details(self, html):
    """
    Parse the HTML of a single MDMS document page into a details dictionary.

    Returns a dictionary with the keys 'submitted_by' ({'name', 'email'}),
    'title', 'authors_string', 'organizations', 'abstract', 'related_docs',
    'ahg', 'sub_group', 'group', 'standard', 'activity' and 'documents'
    (a list of {'rel_path', 'version', 'timestamp'} dictionaries).
    Fields that are empty or missing on the page keep their default value
    (None, or an empty list for 'documents').

    Returns None, after printing an error, if the page does not have the
    expected table layout.
    """
    details = {
        'submitted_by': None,
        'title': None,
        'authors_string': None,
        'organizations': None,
        'abstract': None,
        'related_docs': None,
        'ahg': None,
        'sub_group': None,
        'group': None,
        'standard': None,
        'activity': None,
        'documents': [],
    }
    soup = bs4.BeautifulSoup(html, features='lxml')
    # replace <br/> with a space, it makes checking headers easier
    for br in soup.find_all('br'):
        br.replace_with(' ')

    # do some checks if format is ok
    body = soup.find('body')
    table_main = body.find('table') if body is not None else None
    if table_main is None:
        print('Error: No main table element found. Did MDMS layout change?')
        return None
    rows_main = table_main.find_all('tr', recursive=False)
    if len(rows_main) != 3:
        print('Error: Main table should have 3 rows. Did MDMS layout change?')
        return None
    tables = rows_main[0].find_all('table')
    if len(tables) != 2:
        print('Error: First row in the main table should have only 2 tables in it. Did MDMS layout change?')
        return None
    rows = tables[1].find_all('tr', recursive=False)

    # Plain-text fields as (label substring, details key). Order matters:
    # 'sub group' has to be tested before 'group', which it contains.
    text_fields = (
        ('title', 'title'),
        ('authors', 'authors_string'),
        ('organizations', 'organizations'),
        ('abstract', 'abstract'),
        ('related contributions', 'related_docs'),
        ('ahg', 'ahg'),
        ('sub group', 'sub_group'),
        ('group', 'group'),
        ('standard', 'standard'),
        ('activity', 'activity'),
    )

    # parse
    for row in rows:
        cols = row.find_all('td', recursive=False)
        if len(cols) < 2:
            continue  # not a label/value row
        attribute = cols[0].text.strip().lower()
        entry = cols[1].text.strip()
        if not entry:
            continue  # skip all empty fields

        if 'submitted by' in attribute:
            # The e-mail is taken from the path part of the cell's link
            # (for a mailto: link that is the address itself).
            link = cols[1].a
            href = link.get('href') if link is not None else None
            email = urlparse(href).path if href is not None else None
            details['submitted_by'] = {'name': entry, 'email': email}
            continue

        for label, key in text_fields:
            if label in attribute:
                details[key] = entry
                break
        else:
            # Checked last, exactly like the final branch of the original chain.
            if 'document' in attribute:
                details['documents'].extend(self._parse_document_files(cols[1]))
    return details

def _parse_document_files(self, cell):
    """
    Collect the file entries listed in the 'document' cell of a details page.

    Each entry is built from a link (its URL path becomes 'rel_path') and a
    text of the form '(version N - date ...)'. Returns a list of
    {'rel_path', 'version', 'timestamp'} dictionaries.

    NOTE(review): this revision's loop body was partly lost; the grouping
    rule below (an entry ends at a blank text node, which is what a replaced
    <br/> becomes) is reconstructed from the surviving lines - confirm
    against a real MDMS page.
    """
    documents = []
    pending = dict.fromkeys(('rel_path', 'version', 'timestamp'))

    def flush():
        # Emit the entry collected so far (if any) and start a new one.
        if pending['rel_path'] is not None or pending['version'] is not None:
            documents.append(dict(pending))
        for key in pending:
            pending[key] = None

    for node in cell.contents:
        if isinstance(node, bs4.Tag):
            href = node.get('href')
            if href is None:
                continue  # a tag without a link carries no file information
            if pending['rel_path'] is not None:
                flush()  # a second link starts the next entry
            pending['rel_path'] = urlparse(href).path
            continue

        text = str(node).strip()
        if not text:
            flush()  # blank separator between two entries
            continue
        pos1 = text.find('(version')
        pos2 = text.find('- date', pos1 + 8)
        pos3 = text.find(')', pos2 + 6)
        if pos1 < 0 or pos2 < 0 or pos3 < 0:
            continue  # text is not a '(version N - date ...)' annotation
        pending['version'] = int(text[pos1 + 8:pos2].strip())
        pending['timestamp'] = self.try_parsing_date(text[pos2 + 6:pos3])

    flush()  # the last entry may not be followed by a separator
    return documents
def check_table_header(self, template, header_row):
......@@ -320,9 +416,28 @@ def get_input_documents(meeting_id, standard=Standard.ALL, subgroup=Subgroup.ALL
def get_document_details(document_id):
    """
    Get more details about a document.

    Requests DOCUMENT_URL with '?id=<document_id>' and parses the returned
    page with MDMSParser.parse_document_details.

    Returns the dictionary:
    {'submitted_by': {'name', 'email'}, 'title', 'authors_string', 'organizations', 'abstract', 'related_docs', 'ahg', 'sub_group', 'group', 'standard', 'activity', 'documents': [{'rel_path', 'version', 'timestamp'}, ... ]}
    Returns [] if the server does not answer with HTTP status 200, and None
    if the page layout could not be parsed.
    """
    debug = False  # remove this after debugging
    if debug:
        # Replay the page saved by an earlier non-debug call instead of
        # contacting the server again.
        with open('input.html', 'r', encoding='utf-8') as text_file:
            data = text_file.read()
        return MDMSParser().parse_document_details(data)

    query = '?id={}'.format(document_id)
    url = urljoin(DOCUMENT_URL, query)
    # NOTE(review): the call name was lost in this revision; requests.get is
    # assumed from the auth= tuple and the status_code/text attributes used
    # below - confirm the HTTP method and that 'requests' is imported.
    response = requests.get(url, auth=(MPEG_LOGIN, MPEG_PWD))
    if response.status_code != 200:
        print('HTTP response {} != 200'.format(response.status_code))
        print('\t{}'.format(response.text.replace('\n', '\n\t')))
        return []

    # Keep a copy of the page for the debug branch above. The explicit
    # encoding keeps write and read symmetric on every platform.
    with open('input.html', 'w', encoding='utf-8') as text_file:
        text_file.write(response.text)
    return MDMSParser().parse_document_details(response.text)
def find_documents( title = '',
number = '',
