Commit c9fd621c authored by Dimitri Podborski's avatar Dimitri Podborski
Browse files

Initial commit for mdms module

implement parsing of meetings and input documents
Close #175 #173 #182 #189
parent 1c9720d7
# Automation tools for MPEG workflow
## :warning: ATTENTION :warning:
MDMS and GitLab modules are currently under development. An example of how to use them can be found in `./exampleN.py`.
Currently this is just a simple **very quick hack**, but it works.
A **very quick hack** for automatic issue creation (used for FF group during last meeting) can be found in `./hack.py`.
New ideas are welcome. Open new issues for your ideas.
The general idea is to fire requests to MDMS using curl, process the information, use gitlab API to open issues.
The general idea is to fire requests to MDMS, process the information, use gitlab API to open issues.
## Requirements
......@@ -19,7 +19,6 @@ The general idea is to fire requests to MDMS using curl, process the information
## Tools
- python3
- [curl](https://curl.se)
- ...
### Python packages / libs
......
# init automation package
\ No newline at end of file
import os
# GitLab access token, taken from the environment (None when GITLAB_TOKEN is unset).
TOKEN = os.getenv('GITLAB_TOKEN')
# TODO: implement me
\ No newline at end of file
# -*- coding: utf-8 -*-
from urllib.parse import urljoin, parse_qs, urlparse
import os
import requests
import bs4
from datetime import datetime
from enum import Enum, unique
# "curl -s -X POST -d 'id=" + docID + "&id_meeting=' -u $MPEG_LOGIN:$MPEG_PWD https://dms.mpeg.expert/doc_end_user/current_document.php"
# "curl -s -X POST -d 'search_title=&search_number=" + contrNr + "&search_category=&search_author=&search_id_group=1&search_sub_group=1&id_meeting=&submit=Search&meeting=' -u $MPEG_LOGIN:$MPEG_PWD https://dms.mpeg.expert/doc_end_user/searchAcross.php"
# Root of the MDMS end-user pages; every page URL below is resolved against it.
BASE_URL = 'https://dms.mpeg.expert/doc_end_user/'

# Individual MDMS pages used by this module.
MEETINGS_URL = urljoin(BASE_URL, 'all_meeting.php')
CURRENT_MEETING_URL = urljoin(BASE_URL, 'current_meeting.php')
SEARCH_URL = urljoin(BASE_URL, 'searchAcross.php')
DOCUMENT_URL = urljoin(BASE_URL, 'current_document.php')

# MDMS credentials, read from the environment (None when the variable is unset).
MPEG_LOGIN = os.getenv('MPEG_LOGIN')
MPEG_PWD = os.getenv('MPEG_PWD')
class MDMSParser:
    '''
    Parser for the HTML pages returned by MDMS.

    Every parse_* method returns a list of dicts. When a page does not have
    the expected layout, an error message is printed and an empty list is
    returned instead.
    '''

    # Exceptions raised when a table cell is missing or holds unexpected data.
    # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
    _PARSE_ERRORS = (AttributeError, IndexError, KeyError, TypeError, ValueError)

    def parse_meetings(self, html):
        '''
        Parse the "All Meetings" page.

        Returns a list of dicts with the keys 'number', 'id', 'name',
        'start_date', 'end_date', 'last_input' and 'last_output'
        ('last_output' is None when the cell is empty).
        Returns [] if the page could not be parsed.
        '''
        soup = bs4.BeautifulSoup(html, features='lxml')
        body = soup.find('body')
        # An empty or truncated page has no <body>; treat it as a layout error.
        tables = body.find_all('table') if body is not None else []
        if len(tables) != 1:
            print('Error: Only single table should be present in "All Meetings" frame. Did layout of MDMS change?')
            return []

        rows = tables[0].find_all('tr')
        if not rows:
            return []

        # The first row is the table header, validate it before reading data.
        header = ['number', 'name', 'start date', 'end date', 'last input document', 'last output document']
        if not self.check_table_header(header, rows[0]):
            print('Error: Wrong table header. Did layout of MDMS change?')
            return []

        meetings = []
        for row in rows[1:]:
            cols = row.find_all('td', recursive=False)
            try:
                # Use .text here: .string is None for an empty cell, which
                # would make the emptiness check itself fail.
                last_output_text = cols[5].text.strip()
                last_output = int(last_output_text) if last_output_text else None
                meetings.append({
                    'number': int(cols[0].text),
                    'id': self._query_int(cols[1].a['href'], 'id_meeting'),
                    'name': cols[1].text.strip(),
                    'start_date': datetime.strptime(cols[2].text.strip(), '%Y-%m-%d'),
                    'end_date': datetime.strptime(cols[3].text.strip(), '%Y-%m-%d'),
                    'last_input': int(cols[4].text),
                    'last_output': last_output
                })
            except self._PARSE_ERRORS:
                print('Error: Could not parse meeting data. Did MDMS layout change?')
                return []
        return meetings

    def parse_input_docs(self, html):
        '''
        Parse the page listing the input documents of a meeting.

        Returns a list of dicts with the keys 'mdms_id', 'document', 'created',
        'last_version_uploaded', 'sub_group_text', 'title', 'authors' and
        'latest_version_url'. 'last_version_uploaded' and 'latest_version_url'
        are None when not available.
        Returns [] if the page could not be parsed.
        '''
        soup = bs4.BeautifulSoup(html, features='lxml')
        for br in soup.select('br'):  # replace <br/> with a space, it makes checking headers easier
            br.replace_with(' ')

        body = soup.find('body')
        form = body.find('form', id='documents') if body is not None else None
        if form is None:
            print('Error: No form with id="documents" found. Did MDMS layout change?')
            return []
        table = form.find('table')
        if table is None:
            print('Error: No table found in form. Did MDMS layout change?')
            return []

        rows = table.find_all('tr', recursive=False)
        if not rows:
            return []

        # The first row is the table header, validate it before reading data.
        header = ['number', 'created', 'uploaded', 'Group Working Group / SubGroup', 'title', 'source', 'download']
        if not self.check_table_header(header, rows[0]):
            print('Error: Wrong table header. Did layout of MDMS change?')
            return []

        docs = []
        for row in rows[1:]:
            cols = row.find_all('td', recursive=False)
            try:
                # Rows without a document number carry no document, skip them.
                if not cols[0].text:
                    continue

                # Timestamp of the last uploaded version (cell may be empty).
                uploaded_text = cols[2].text.strip()
                last_version_uploaded = None
                if uploaded_text:
                    last_version_uploaded = datetime.strptime(uploaded_text, '%Y-%m-%d %H:%M:%S')

                # Link to the latest document version (if available).
                latest_url = None
                if len(cols) == 7:
                    link = cols[6].find('a')
                    if link is not None:
                        latest_url = urljoin(CURRENT_MEETING_URL, link['href'])

                docs.append({
                    'mdms_id': self._query_int(cols[0].a['href'], 'id'),
                    'document': cols[0].text,
                    'created': datetime.strptime(cols[1].text.strip(), '%Y-%m-%d %H:%M:%S'),
                    'last_version_uploaded': last_version_uploaded,
                    'sub_group_text': cols[3].text,
                    'title': cols[4].text.strip(),
                    'authors': self._parse_authors(cols[5]),
                    'latest_version_url': latest_url
                })
            except self._PARSE_ERRORS:
                print('Error: Could not parse input documents data. Did MDMS layout change?')
                return []
        return docs

    def check_table_header(self, template, header_row):
        '''
        Check if header_row contains the same data as the template.

        template is a list of expected column titles, header_row is the table
        row to check. The comparison ignores case and surrounding whitespace.
        Prints the reason and returns False on the first mismatch.
        '''
        cols = header_row.find_all('td', recursive=False)
        if len(template) != len(cols):
            print('Error: Table header should have {} columns but it has {}.'.format(len(template), len(cols)))
            return False
        for cell, expected in zip(cols, template):
            found = cell.text.strip().lower()
            wanted = expected.strip().lower()
            if found != wanted:
                print('Error: Table header entry mismatch: "{}" != "{}".'.format(found, wanted))
                return False
        return True

    @staticmethod
    def _query_int(href, key):
        '''Return the first value of query parameter key in href as an int.'''
        return int(parse_qs(urlparse(href).query)[key][0])

    @staticmethod
    def _parse_authors(cell):
        '''
        Collect the authors listed in a "source" table cell.

        Tags contribute their text as name and the path of their href as email
        (presumably a mailto: link - verify against MDMS). Plain text between
        tags contributes a name without email; commas and blanks are dropped.
        '''
        authors = []
        for entry in cell.contents:
            if isinstance(entry, bs4.Tag):
                authors.append({
                    'name': entry.text,
                    'email': urlparse(entry['href']).path
                })
                continue
            name = entry.string.replace(',', '').strip()
            if name:
                authors.append({
                    'name': name,
                    'email': ''
                })
        return authors
@unique
class SearchCategory(Enum):
    '''
    Document category filter.

    The value of a member is what gets sent as the search_category
    query parameter.
    '''
    ALL = ''
    INPUT = 'm'
    OUTPUT = 'w'
# search_id_group
@unique
class Standard(Enum):
    '''
    Standard (group) filter.

    The value of a member is what gets sent as the search_id_group
    query parameter.
    '''
    ALL = 1

    MPEG_DASH = 38
    MPEG_G = 43
    MPEG_IOMT = 44
    MPEG_N = 49

    MPEG_1 = 4
    MPEG_2 = 5
    MPEG_4 = 6
    MPEG_7 = 7
    MPEG_21 = 8

    MPEG_A = 9
    MPEG_B = 10
    MPEG_C = 11
    MPEG_D = 12
    MPEG_E = 13
    MPEG_M = 14
    MPEG_U = 15
    MPEG_V = 16
    MPEG_H = 26

    MPEG_UD = 39
    MPEG_GREEN = 40
    MPEG_I = 41
    MPEG_CICP = 42
    MPEG_5 = 50

    EXPLORATIONS = 45
    MAR_REFERENCE_MODEL = 46
# search_sub_group
@unique
class Subgroup(Enum):
    '''
    Subgroup filter.

    The value of a member is what gets sent as the search_sub_group
    query parameter.
    '''
    ALL = 1

    JVET = 41

    WG2 = 43
    WG3 = 44
    WG4 = 45
    WG5 = 46
    WG6 = 47
    WG7 = 48
    WG8 = 49

    AG2 = 50
    AG3 = 51
    AG4 = 52
    AG5 = 53

    JCTVC = 38
    JCT3V = 39
# id_meeting=<meeting_id>
# type_order=0 # 0-inc, 1-decreasing order (input + output documents)
# sql_type=document_number | document_date_time | upload_document_date_time | document.id_sub_group | title | authors
def _get_query_string(meeting_id,
                      category=SearchCategory.INPUT,
                      group=Standard.ALL,
                      subgroup=Subgroup.ALL):
    '''
    Build the query string (including the leading "?") that selects the
    documents of a meeting, filtered by category, group and subgroup.

    category, group and subgroup are enum members, their values are used.
    '''
    params = (
        ('id_meeting', meeting_id),
        ('search_category', category.value),
        ('search_id_group', group.value),
        ('search_sub_group', subgroup.value),
    )
    return '?' + '&'.join('{}={}'.format(key, value) for key, value in params)
# --------------------------------------------------------------------------------------------------
# Interfaces
# --------------------------------------------------------------------------------------------------
def get_meetings():
    '''
    Request the list of all meetings from MDMS and return the parsed data.
    [{ 'number', 'id', 'name', 'start_date', 'end_date', 'last_input', 'last_output' }, ...]

    Prints the response and returns [] when the HTTP status is not 200.
    '''
    response = requests.get(MEETINGS_URL, auth=(MPEG_LOGIN, MPEG_PWD))
    if response.status_code == 200:
        return MDMSParser().parse_meetings(response.text)

    # Dump the (indented) response body to help diagnosing the failure.
    print('HTTP response {} != 200'.format(response.status_code))
    print('\t{}'.format(response.text.replace('\n', '\n\t')))
    return []
def get_current_meeting():
    '''
    Returns data of the latest meeting, i.e. the one with the highest number.
    { 'number', 'id', 'name', 'start_date', 'end_date', 'last_input', 'last_output' }

    Returns None when no meetings could be retrieved (get_meetings() returns
    an empty list on HTTP or parsing errors).
    '''
    meetings = get_meetings()
    # default=None avoids the ValueError max() raises for an empty sequence.
    return max(meetings, key=lambda x: x['number'], default=None)
def get_input_documents(meeting_id, standard=Standard.ALL, subgroup=Subgroup.ALL):
    '''
    Returns data of all input documents of a certain meeting.
    [{'mdms_id', 'document', 'created', 'last_version_uploaded', 'sub_group_text', 'title', 'authors', 'latest_version_url'}, ...]

    meeting_id is the MDMS id of the meeting (the 'id' entry of a meeting),
    standard and subgroup optionally narrow down the result.
    Prints the response and returns [] when the HTTP status is not 200.
    '''
    # The former debugging aid (dumping every response to 'input.html' in the
    # current directory and an unreachable branch reading it back) was removed:
    # a library call must not write files as a side effect.
    query = _get_query_string(meeting_id, SearchCategory.INPUT, standard, subgroup)
    url = urljoin(CURRENT_MEETING_URL, query)
    response = requests.get(url, auth=(MPEG_LOGIN, MPEG_PWD))
    if response.status_code != 200:
        print('HTTP response {} != 200'.format(response.status_code))
        print('\t{}'.format(response.text.replace('\n', '\n\t')))
        return []
    return MDMSParser().parse_input_docs(response.text)
def get_document_details(document_id):
    '''
    Get more details about a document.

    Not implemented yet, always returns None.
    TODO: Fire a POST request to DOCUMENT_URL and parse the result.
    '''
    return None
def find_documents(title='',
                   number='',
                   author='',
                   category=SearchCategory.ALL,
                   group=Standard.ALL,
                   subgroup=Subgroup.ALL):
    '''
    Find documents using the search URL.

    Not implemented yet, always returns None.
    TODO: Fire a POST request to SEARCH_URL and parse the result
    '''
    return None
\ No newline at end of file
# Example: list the MPEG meetings and the input documents of the latest one.
from automation import mdms

meetings = mdms.get_meetings()
print('Number of MPEG meetings:', len(meetings))

# get_meetings() returns [] on errors; there is no latest meeting to show then.
if meetings:
    last_meeting = mdms.get_current_meeting()
    print('\nLast MPEG#{} ({}) from {} to {}'.format(last_meeting['number'], last_meeting['name'], last_meeting['start_date'], last_meeting['end_date']))

    input_docs = mdms.get_input_documents(last_meeting['id'])
    print('\nNumber of input contributions: ', len(input_docs))
    # A meeting may have no input documents (or the request may have failed).
    if input_docs:
        print("First entry: ", input_docs[0])
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment