Commit b2bc6cc5 authored by Dimitri Podborski
Browse files

add container url to docs, docinfo rel to absolute

Adding container URL to each document
Change relative url to absolute URL in get_document_details
Clean up a bit
parent a0be21e9
......@@ -119,6 +119,7 @@ class MDMSParser:
# get document ID on MDMS
parsed_href = urlparse(cols[0].a['href'])
mdms_id = int(parse_qs(parsed_href.query)['id'][0])
container_url = urljoin(DOCUMENT_URL, '?id={}'.format(mdms_id))
# get timestamp of the last uploaded version
last_version_uploaded = self.try_parsing_date(cols[2].text)
......@@ -175,7 +176,8 @@ class MDMSParser:
'sub_group_text': cols[3].text,
'title': cols[4].text.strip(),
'authors': authors,
'latest_version_url': latest_url
'latest_version_url': latest_url,
'container': container_url
})
except: # TODO: catch properly
print('Error: Could not parse input documents data. Did MDMS layout change?')
......@@ -261,7 +263,7 @@ class MDMSParser:
entry = entry.string.strip()
if len(entry) == 0:
details['documents'].append({
'rel_path': rel_path,
'path': rel_path,
'version': version,
'timestamp': timestamp
})
......@@ -393,51 +395,33 @@ def get_input_documents(meeting_id, standard=Standard.ALL, subgroup=Subgroup.ALL
[{'mdms_id', 'document', 'created', 'last_version_uploaded', 'sub_group_text', 'title', 'authors', 'latest_version_url', 'container'}, ...]
'''
debug = False # remove this after debugging
if not debug:
query = _get_query_string(meeting_id, SearchCategory.INPUT, standard, subgroup)
url = urljoin(CURRENT_MEETING_URL, query)
response = requests.get(url, auth=(MPEG_LOGIN, MPEG_PWD))
if not response.status_code == 200:
print('HTTP response {} != 200'.format(response.status_code))
print('\t{}'.format(response.text.replace('\n', '\n\t')))
return []
with open('input.html', 'w') as text_file:
text_file.write(response.text)
parser = MDMSParser()
return parser.parse_input_docs(response.text)
else:
with open('input.html', 'r') as text_file:
data = text_file.read()
parser = MDMSParser()
return parser.parse_input_docs(data)
query = _get_query_string(meeting_id, SearchCategory.INPUT, standard, subgroup)
url = urljoin(CURRENT_MEETING_URL, query)
response = requests.get(url, auth=(MPEG_LOGIN, MPEG_PWD))
if not response.status_code == 200:
print('HTTP response {} != 200'.format(response.status_code))
print('\t{}'.format(response.text.replace('\n', '\n\t')))
return []
parser = MDMSParser()
return parser.parse_input_docs(response.text)
def get_document_details(document_id):
'''
Get more details about a document.
{'submitted_by': {'name', 'email'}, 'title', 'authors_string', 'organizations', 'abstract', 'related_docs', 'ahg', 'sub_group', 'group', 'standard', 'activity', 'documents': [{'rel_path', 'version', 'timestamp'}, ... ]}
{'submitted_by': {'name', 'email'}, 'title', 'authors_string', 'organizations', 'abstract', 'related_docs', 'ahg', 'sub_group', 'group', 'standard', 'activity', 'documents': [{'path', 'version', 'timestamp'}, ... ]}
'''
debug = False # remove this after debugging
if not debug:
query = '?id={}'.format(document_id)
url = urljoin(DOCUMENT_URL, query)
response = requests.post(url, auth=(MPEG_LOGIN, MPEG_PWD))
if not response.status_code == 200:
print('HTTP response {} != 200'.format(response.status_code))
print('\t{}'.format(response.text.replace('\n', '\n\t')))
return []
parser = MDMSParser()
with open('input.html', 'w') as text_file:
text_file.write(response.text)
parser = MDMSParser()
return parser.parse_document_details(response.text)
else:
with open('input.html', 'r') as text_file:
data = text_file.read()
parser = MDMSParser()
return parser.parse_document_details(data)
query = '?id={}'.format(document_id)
url = urljoin(DOCUMENT_URL, query)
response = requests.post(url, auth=(MPEG_LOGIN, MPEG_PWD))
if not response.status_code == 200:
print('HTTP response {} != 200'.format(response.status_code))
print('\t{}'.format(response.text.replace('\n', '\n\t')))
return []
parser = MDMSParser()
details = parser.parse_document_details(response.text)
for n in range(len(details['documents'])):
details['documents'][n]['path'] = urljoin(DOCUMENT_URL, details['documents'][n]['path']) # relative to absolute
return details
def find_documents( title = '',
number = '',
......
......@@ -7,5 +7,8 @@ last_meeting = mdms.get_current_meeting()
print('\nLast MPEG#{} ({}) from {} to {}'.format(last_meeting['number'], last_meeting['name'], last_meeting['start_date'], last_meeting['end_date']))
input_docs = mdms.get_input_documents(last_meeting['id'])
print('\nNumber of input contributions: ', len(input_docs))
print("First entry: ", input_docs[0])
\ No newline at end of file
print('\nNumber of input contributions:', len(input_docs))
print('First entry:', input_docs[0])
doc_details = mdms.get_document_details(input_docs[0]['mdms_id'])
print('\nDetails of the first document:', doc_details)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment