Commit 70dcbd7a authored by Dimitri Podborski's avatar Dimitri Podborski
Browse files

implement find_documents API

parent 094c7503
......@@ -99,6 +99,104 @@ class MDMSParser:
pass
return None
def parse_search_docs(self, html):
    """Parse the HTML of an MDMS document-search result page.

    Walks the first table inside the form with id="documents", validates
    the header row, and extracts one dict per document row.

    :param html: raw HTML text of the search-results page
    :return: list of dicts with keys 'mdms_id', 'document', 'meeting',
             'created', 'last_version_uploaded', 'sub_group_text', 'title',
             'authors', 'latest_version_url', 'container'.
             Returns an empty list on any layout/parse problem (previously
             one branch returned None; unified to [] so callers handle a
             single failure shape).
    """
    docs = []
    soup = bs4.BeautifulSoup(html, features='lxml')
    # Replace <br/> with a space, it makes checking headers easier.
    for br_tag in soup.select('br'):
        br_tag.replace_with(' ')
    form = soup.find('body').find('form', id='documents')
    if not form:
        print('Error: No form with id="documents" found. Did MDMS layout change?')
        return []
    table_main = form.find('table')
    if not table_main:
        print('Error: No main table element found. Did MDMS layout change?')
        return []  # was `return None`: unified with the other error paths
    rows = table_main.find_all('tr', recursive=False)
    for n, row in enumerate(rows):
        if n == 0:  # check header first
            header = ['number', 'meeting', 'created', 'Uploaded', 'Group SubGroup', 'title', 'Author(s)', '']
            if not self.check_table_header(header, row):
                print('Error: Wrong table header. Did layout of MDMS change?')
                return []
            continue
        cols = row.find_all('td', recursive=False)
        try:
            if len(cols[0].text) == 0:
                continue
            # get document ID on MDMS
            parsed_href = urlparse(cols[0].a['href'])
            mdms_id = int(parse_qs(parsed_href.query)['id'][0])
            container_url = urljoin(DOCUMENT_URL, '?id={}'.format(mdms_id))
            # get timestamp of the last uploaded version
            created_timestamp = self.try_parsing_date(cols[2].text)
            uploaded_timestamp = self.try_parsing_date(cols[3].text)
            authors = self._get_authors(cols[6])
            # get latest document link (if available)
            latest_url = None
            if len(cols) == 8:
                if not cols[7].find('a') is None:
                    latest_url = urljoin(CURRENT_MEETING_URL, cols[7].find('a')['href'])
            docs.append({
                'mdms_id': mdms_id,
                'document': cols[0].text,
                'meeting': cols[1].text,
                'created': created_timestamp,
                'last_version_uploaded': uploaded_timestamp,
                'sub_group_text': cols[4].text,
                'title': cols[5].text.strip(),
                'authors': authors,
                'latest_version_url': latest_url,
                'container': container_url
            })
        # Narrowed from a bare `except:` — these are the failures a layout
        # change can produce (missing tag -> AttributeError/TypeError,
        # missing attr/query key -> KeyError, short row -> IndexError,
        # non-numeric id -> ValueError).
        except (AttributeError, KeyError, IndexError, TypeError, ValueError):
            print('Error: Could not parse search documents data. Did MDMS layout change?')
            return []
    return docs
def _get_authors(self, col):
    """Extract author entries from a table cell of the documents table.

    Each child of *col* is either an <a> tag (name + mailto href) or a
    plain text node listing names separated by commas / ' and '.

    :param col: bs4 element holding the Author(s) column content
    :return: list of {'name': ..., 'email': ...} dicts (email may be None)
    """
    authors = []
    for node in col.contents:
        if not isinstance(node, bs4.Tag):
            # Plain text node: split on separators and parse each name.
            for piece in node.string.replace(' and ', ',').split(','):
                parsed = self.parse_author_entry(piece)
                if parsed:
                    authors.append({
                        'name': parsed[0],
                        'email': parsed[1]
                    })
            continue
        email = None
        try:
            email = urlparse(node['href']).path
            parsed = self.parse_author_entry(node.text)
            name = node.text
            if parsed:
                name = parsed[0]  # clean version of the name
                # sometimes people type name and email wrong in MDMS and they are flipped
                if '@' not in email and parsed[1]:
                    name = email
                    email = parsed[1]
        except KeyError:
            # sometimes Author's field is formatted with fake html tags.
            print('Bad HTML format in Authors field: ', node)
            name = node.text
        authors.append({
            'name': name,
            'email': email
        })
    return authors
def parse_input_docs(self, html):
docs = []
soup = bs4.BeautifulSoup(html, features='lxml')
......@@ -138,39 +236,7 @@ class MDMSParser:
created_timestamp = self.try_parsing_date(cols[1].text)
# get authors
authors = []
for entry in cols[5].contents:
if isinstance(entry, bs4.Tag):
email = None
try:
parsed_href = urlparse(entry['href'])
email = parsed_href.path
author_data = self.parse_author_entry(entry.text)
name = entry.text
if author_data:
name = author_data[0] # clean version of the name
# sometimes people type name and email wrong in MDMS and they are flipped
if '@' not in email and author_data[1]:
name = email
email = author_data[1]
except KeyError:
# sometimes Author's field is formatted with fake html tags.
print('Bad HTML format in Authors field: ', entry)
name = entry.text
pass
authors.append({
'name': name,
'email': email
})
else:
for author_entry in entry.string.replace(' and ', ',').split(','):
author_data = self.parse_author_entry(author_entry)
if author_data:
authors.append({
'name': author_data[0],
'email': author_data[1]
})
authors = self._get_authors(cols[5])
# get latest document link (if available)
latest_url = None
......@@ -455,6 +521,22 @@ def find_documents(title='',
subgroup=Subgroup.ALL):
"""
Find documents using the search URL.
TODO: Fire a POST request to SEARCH_URL and parse the result
"""
raise NotImplemented
query = '?search_title={}' \
'&search_number={}'\
'&search_category={}' \
'&search_author={}'\
'&search_id_group={}' \
'&search_sub_group={}' \
'&id_meeting=0' \
'&submit=Search' \
'&meeting=0'.format(title, number, category.value, author, group.value, subgroup.value)
url = urljoin(SEARCH_URL, query)
response = requests.post(url, auth=(MPEG_LOGIN, MPEG_PWD))
if not response.status_code == 200:
print('HTTP response {} != 200'.format(response.status_code))
print('\t{}'.format(response.text.replace('\n', '\n\t')))
return None
parser = MDMSParser()
return parser.parse_search_docs(response.text)
......@@ -210,7 +210,6 @@ def open_issues(table_entries, test, gitlab_members, meeting_start):
if 'y' in user_input:
new_description = helpers.get_updated_issue_description(issue_with_meta.description, document,
document_details)
current_labels = issue_with_meta.labels
if 'DocAvailable' not in issue_with_meta.labels:
issue_with_meta.labels.append('DocAvailable')
if not test:
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment