Commit 70dcbd7a authored by Dimitri Podborski
Browse files

implement find_documents API

parent 094c7503
......@@ -99,7 +99,7 @@ class MDMSParser:
pass
return None
def parse_input_docs(self, html):
def parse_search_docs(self, html):
docs = []
soup = bs4.BeautifulSoup(html, features='lxml')
for i in soup.select('br'): # replace <br/> with a space, it makes checking headers easier
......@@ -109,16 +109,15 @@ class MDMSParser:
if not form:
print('Error: No form with id="documents" found. Did MDMS layout change?')
return []
table = form.find('table')
if not table:
print('Error: No table found in form. Did MDMS layout change?')
return []
table_main = form.find('table')
if not table_main:
print('Error: No main table element found. Did MDMS layout change?')
return None
rows = table_main.find_all('tr', recursive=False)
rows = table.find_all('tr', recursive=False)
for n in range(len(rows)):
if n == 0: # check header first
header = ['number', 'created', 'uploaded', 'Group Working Group / SubGroup', 'title', 'source',
'download']
header = ['number', 'meeting', 'created', 'Uploaded', 'Group SubGroup', 'title', 'Author(s)', '']
if not self.check_table_header(header, rows[n]):
print('Error: Wrong table header. Did layout of MDMS change?')
return []
......@@ -134,12 +133,37 @@ class MDMSParser:
container_url = urljoin(DOCUMENT_URL, '?id={}'.format(mdms_id))
# get timestamp of the last uploaded version
last_version_uploaded = self.try_parsing_date(cols[2].text)
created_timestamp = self.try_parsing_date(cols[1].text)
created_timestamp = self.try_parsing_date(cols[2].text)
uploaded_timestamp = self.try_parsing_date(cols[3].text)
# get authors
authors = self._get_authors(cols[6])
# get latest document link (if available)
latest_url = None
if len(cols) == 8:
if not cols[7].find('a') is None:
latest_url = urljoin(CURRENT_MEETING_URL, cols[7].find('a')['href'])
docs.append({
'mdms_id': mdms_id,
'document': cols[0].text,
'meeting': cols[1].text,
'created': created_timestamp,
'last_version_uploaded': uploaded_timestamp,
'sub_group_text': cols[4].text,
'title': cols[5].text.strip(),
'authors': authors,
'latest_version_url': latest_url,
'container': container_url
})
except: # TODO: catch properly
print('Error: Could not parse search documents data. Did MDMS layout change?')
return []
return docs
def _get_authors(self, col):
authors = []
for entry in cols[5].contents:
for entry in col.contents:
if isinstance(entry, bs4.Tag):
email = None
try:
......@@ -171,6 +195,48 @@ class MDMSParser:
'name': author_data[0],
'email': author_data[1]
})
return authors
def parse_input_docs(self, html):
docs = []
soup = bs4.BeautifulSoup(html, features='lxml')
for i in soup.select('br'): # replace <br/> with a space, it makes checking headers easier
i.replace_with(' ')
form = soup.find('body').find('form', id='documents')
if not form:
print('Error: No form with id="documents" found. Did MDMS layout change?')
return []
table = form.find('table')
if not table:
print('Error: No table found in form. Did MDMS layout change?')
return []
rows = table.find_all('tr', recursive=False)
for n in range(len(rows)):
if n == 0: # check header first
header = ['number', 'created', 'uploaded', 'Group Working Group / SubGroup', 'title', 'source',
'download']
if not self.check_table_header(header, rows[n]):
print('Error: Wrong table header. Did layout of MDMS change?')
return []
continue
cols = rows[n].find_all('td', recursive=False)
try:
if len(cols[0].text) == 0:
continue
# get document ID on MDMS
parsed_href = urlparse(cols[0].a['href'])
mdms_id = int(parse_qs(parsed_href.query)['id'][0])
container_url = urljoin(DOCUMENT_URL, '?id={}'.format(mdms_id))
# get timestamp of the last uploaded version
last_version_uploaded = self.try_parsing_date(cols[2].text)
created_timestamp = self.try_parsing_date(cols[1].text)
# get authors
authors = self._get_authors(cols[5])
# get latest document link (if available)
latest_url = None
......@@ -455,6 +521,22 @@ def find_documents(title='',
subgroup=Subgroup.ALL):
"""
Find documents using the search URL.
TODO: Fire a POST request to SEARCH_URL and parse the result
"""
raise NotImplemented
query = '?search_title={}' \
'&search_number={}'\
'&search_category={}' \
'&search_author={}'\
'&search_id_group={}' \
'&search_sub_group={}' \
'&id_meeting=0' \
'&submit=Search' \
'&meeting=0'.format(title, number, category.value, author, group.value, subgroup.value)
url = urljoin(SEARCH_URL, query)
response = requests.post(url, auth=(MPEG_LOGIN, MPEG_PWD))
if not response.status_code == 200:
print('HTTP response {} != 200'.format(response.status_code))
print('\t{}'.format(response.text.replace('\n', '\n\t')))
return None
parser = MDMSParser()
return parser.parse_search_docs(response.text)
......@@ -210,7 +210,6 @@ def open_issues(table_entries, test, gitlab_members, meeting_start):
if 'y' in user_input:
new_description = helpers.get_updated_issue_description(issue_with_meta.description, document,
document_details)
current_labels = issue_with_meta.labels
if 'DocAvailable' not in issue_with_meta.labels:
issue_with_meta.labels.append('DocAvailable')
if not test:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment