Commit 952b1d18 authored by Dimitri Podborski's avatar Dimitri Podborski
Browse files

Merge branch 'refactoring' into 'master'

release 1.4

Closes #197 and #199

See merge request !5
parents 183bbcbd 4d1a3662
......@@ -46,6 +46,11 @@ Temporary Items
# Local History for Visual Studio Code
.history/
####################################
# IntelliJ
####################################
.idea/*
####################################
# Python
####################################
......
......@@ -51,32 +51,29 @@ Below are a few examples:
1. Open issues based on the information provided in a CSV file:
e.g.: `python systems.py -o --csv Contribs.csv`
2. Open issues based on CLI options:
e.g.: `python systems.py -o -m 55958,55959,56121 -p FileFormat/CENC --meeting 133`
e.g.: `python systems.py -o -m 55958,55959,56121 -p http://... --meeting 133`
3. Generate an output document based on the information provided in a CSV file. Use a template as a basis (`--template` is optional):
e.g.: `python systems.py -d --csv Contribs.csv --template template.docx`
4. Close issues based on the information provided in a CSV file:
e.g.: `python systems.py -c --csv Contribs.csv`
5. Close issues based on CLI options:
e.g.: `python systems.py -c -m m55958,m55959,m56121 -p FileFormat/CENC --meeting 133`
e.g.: `python systems.py -c -m m55958,m55959,m56121 -p http://... --meeting 133`
6. Print information about input documents on MDMS and GitLab:
e.g.: `python systems.py -l -m m55958,m55959,m56121 -p FileFormat/CENC --meeting 133`
e.g.: `python systems.py -l -m m55958,m55959,m56121 -p http://... --meeting 133`
The CSV file must have a header row with the following entries:
- **Number** - MPEG document number with entries like `m12345`
- To determine which GitLab project needs to be used make sure that your CSV file has either:
- **Project URL** - a full URL to your GitLab project (*recommended*)
- or **Sub Group** and **Project Name** - two last elements of the Project URL. (non-case-sensitive)
e.g.: http://mpegx.int-evry.fr/software/MPEG/Systems/PCC-SYS/V-PCC → `Sub Group=PCC-SYS`, `Project Name=V-PCC`.
e.g.: http://mpegx.int-evry.fr/software/MPEG/Systems/FileFormat/isobmff → `Sub Group=FileFormat`, `Project Name=ISOBMFF`
- **Close issue** - is required if you want to close multiple issues at once. Supported values are `0`, `1`, `TRUE`, `FALSE`, `true` and `false`.
- **Number** - MPEG document numbers
- **Project URL** - a full URL to your GitLab project
- **Close issue** - (optional) if you want to close multiple issues at once. Supported values are `0`, `1`, `TRUE`, `FALSE`, `true` and `false`.
The example CSV below has both `Project URL` and (`Sub Group` with `Project Name`) but you can also have one of these in your CSV. The CSV delimiter is determined automatically.
The CSV delimiter is determined automatically. The order of columns does not matter. CSV file example:
```csv
Number;Title;Project URL;Sub Group;Project Name
m55958;On item encryption;http://mpegx.int-evry.fr/software/MPEG/Systems/FileFormat/CENC;FileFormat;CENC
m55959;On multi-key encryption;http://mpegx.int-evry.fr/software/MPEG/Systems/FileFormat/CENC;FileFormat;CENC
Number;Whatever column;Project URL
m55958;On item encryption;http://mpegx.int-evry.fr/software/MPEG/Systems/FileFormat/CENC
m55959;On multi-key encryption;http://mpegx.int-evry.fr/software/MPEG/Systems/FileFormat/CENC
...
```
## 3. generate_ballot_issues.py
......
# -*- coding: utf-8 -*-
'''
"""
This is the interface to MPEG GitLab API.
'''
"""
import os
import gitlab
......@@ -10,6 +10,7 @@ from enum import Enum, unique
BASE_URL = 'http://mpegx.int-evry.fr/software'
TOKEN = os.environ.get('GITLAB_TOKEN')
@unique
class Label(Enum):
Accepted = 'Accepted'
......@@ -27,6 +28,7 @@ class Label(Enum):
SeeDoCR = 'SeeDoCR'
Withdrawn = 'Withdrawn'
# private token authentication
GL = gitlab.Gitlab(BASE_URL, private_token=TOKEN)
try:
......@@ -48,6 +50,7 @@ def _get_project(project_id):
return None
return project
# --------------------------------------------------------------------------------------------------
# Interfaces
# --------------------------------------------------------------------------------------------------
......@@ -68,6 +71,7 @@ def get_projects():
})
return projects_stripped
def get_members(group_id):
if not GL:
print('Error: GitLab API authentication failed.')
......@@ -79,7 +83,7 @@ def get_members(group_id):
real_group = GL.groups.get(subgroup.id, lazy=True)
members = real_group.members.all(all=True)
for member in members:
if not member.username in members_stripped:
if member.username not in members_stripped:
members_stripped[member.username] = {
'id': member.id,
'name': member.name,
......@@ -87,6 +91,7 @@ def get_members(group_id):
}
return members_stripped
def get_issues(project_id):
project = _get_project(project_id)
if not project:
......@@ -94,13 +99,17 @@ def get_issues(project_id):
issues = project.issues.list(state='opened', all=True)
return issues
def open_issue(project_id, title, description, labels=None):
    """
    Create a new issue with the given title, description and labels in a GitLab project.

    labels is an optional list of label names. None means "no labels"; a fresh list is created on
    every call so that no mutable default is shared between calls.
    Nothing is created if _get_project() does not return a project.
    """
    project = _get_project(project_id)
    if not project:
        return
    if labels is None:
        labels = []
    issue = project.issues.create({'title': title, 'description': description, 'labels': labels})
    issue.save()
def close_issue(issue):
if isinstance(issue, gitlab.v4.objects.ProjectIssue):
issue.state_event = 'close'
......
# -*- coding: utf-8 -*-
'''
"""
Some helper functions
'''
"""
import json
import os
import re
......@@ -12,28 +12,32 @@ from docx.enum.text import WD_ALIGN_PARAGRAPH # pylint: disable=E0611
OPENING_TAG = '[//]: # ( !!! ATTENTION !!! DO NOT MODIFY BEFORE AND AFTER THIS LINE)'
CLOSING_TAG = '[//]: # ( !!! ATTENTION !!! YOU CAN MODIFY AFTER THIS LINE)'
DEADLINE_DAYS = 6 # Number of days to be subtracted from Monday
class DocumentFormatter:
def __init__(self, template_path):
self.__doc = Document(docx = template_path)
self.__doc = Document(docx=template_path)
def save(self, output_path):
self.__doc.save(output_path)
# https://github.com/python-openxml/python-docx/issues/74#issuecomment-261169410
def add_hyperlink(self, paragraph, url, text):
@staticmethod
def add_hyperlink(paragraph, url, text):
part = paragraph.part
r_id = part.relate_to(url, opc.constants.RELATIONSHIP_TYPE.HYPERLINK, is_external = True)
r_id = part.relate_to(url, opc.constants.RELATIONSHIP_TYPE.HYPERLINK, is_external=True)
hyperlink = oxml.shared.OxmlElement('w:hyperlink')
hyperlink.set(oxml.shared.qn('r:id'), r_id, )
run = oxml.shared.OxmlElement('w:r')
rPr = oxml.shared.OxmlElement('w:rPr')
r_pr = oxml.shared.OxmlElement('w:rPr')
c = oxml.shared.OxmlElement('w:color')
c.set(oxml.shared.qn('w:val'), '0000EE')
rPr.append(c)
r_pr.append(c)
u = oxml.shared.OxmlElement('w:u')
u.set(oxml.shared.qn('w:val'), 'single')
rPr.append(u)
run.append(rPr)
r_pr.append(u)
run.append(r_pr)
run.text = text
hyperlink.append(run)
paragraph._p.append(hyperlink)
......@@ -44,7 +48,6 @@ class DocumentFormatter:
project_name = project['name']
h = self.__doc.add_heading('', 2)
self.add_hyperlink(h, project_url, project_name)
p = None
if len(project_description) > 0:
p = self.__doc.add_paragraph(project_description)
else:
......@@ -63,7 +66,7 @@ class DocumentFormatter:
h.add_run(' ' + document['title'])
# Create a 4x2 table with all borders
table = self.__doc.add_table(rows = 4, cols = 2)
table = self.__doc.add_table(rows=4, cols=2)
table.style = 'Table Grid'
# Set the text of all the cells
......@@ -85,7 +88,8 @@ class DocumentFormatter:
p = table.rows[2].cells[1].add_paragraph()
self.add_hyperlink(p, issue_title.web_url, issue_title.references['full'])
else:
self.add_hyperlink(table.rows[2].cells[1].paragraphs[0], issue_title.web_url, issue_title.references['full'])
self.add_hyperlink(table.rows[2].cells[1].paragraphs[0], issue_title.web_url,
issue_title.references['full'])
issues_added += 1
table.rows[3].cells[0].text = 'Disposition'
......@@ -98,21 +102,23 @@ class DocumentFormatter:
p = self.__doc.add_paragraph('<minutes>')
p.paragraph_format.space_before = shared.Pt(8)
def is_document_late(meeting_start, v1_upload_timestamp):
    """
    Check whether the first version of a document was uploaded after the deadline.

    meeting_start and v1_upload_timestamp shall be datetime objects.
    The deadline is midnight of the meeting start day minus DEADLINE_DAYS days. An upload exactly
    at the deadline already counts as late.
    """
    # reset the time of day completely (including microseconds) so the deadline is exactly midnight
    meeting_start = meeting_start.replace(hour=0, minute=0, second=0, microsecond=0)
    deadline = meeting_start - timedelta(days=DEADLINE_DAYS)
    return v1_upload_timestamp >= deadline
def try_parsing_date(text):
'''
"""
Try parsing the timestamp, if not possible return None
'''
"""
for fmt in ('%Y-%m-%d %H:%M:%S', '%Y-%m-%d'):
try:
return datetime.strptime(text.strip(), fmt)
......@@ -120,37 +126,41 @@ def try_parsing_date(text):
pass
return None
def load_json_data(json_path):
    """
    Load json file from json_path (read as UTF-8) and return the data.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        return json.load(f)
def store_json_data(json_path, data):
    """
    Store data as a json file to json_path. datetime objects are stored as strings.

    Missing parent directories of json_path are created first. The file is written as UTF-8 with
    non-ASCII characters kept as they are.
    """
    dir_name = os.path.dirname(json_path)
    if dir_name:
        # exist_ok avoids the race between checking for the directory and creating it
        os.makedirs(dir_name, exist_ok=True)
    with open(json_path, 'w', encoding='utf-8') as f:
        # default=str: everything json cannot serialise (e.g. datetime) is written as str(value)
        json.dump(data, f, ensure_ascii=False, indent=2, default=str)
def find_meeting(meetings, meeting_number):
    """
    Find and return a meeting using the meeting_number. If meeting_number < 0 return the latest
    meeting, i.e. the one with the highest 'number'.

    Return None if meetings is empty or if no meeting has the requested number.
    """
    if not meetings:
        return None
    if meeting_number < 0:
        return max(meetings, key=lambda x: x['number'])
    for meeting in meetings:
        if meeting['number'] == meeting_number:
            return meeting
    return None
def find_document(documents, document_number):
striped_doc_nr = document_number.replace(' ', '').strip().lower()
for doc in documents:
......@@ -158,21 +168,20 @@ def find_document(documents, document_number):
return doc
return None
def find_project(projects, url_or_path):
    """
    Search for gitlab project based on URL.

    projects is an iterable of dicts with a 'url' entry. Spaces and leading/trailing slashes are
    removed from url_or_path before it is compared, and it may contain additional path elements
    after the project URL.
    An exact URL match wins. Otherwise the project with the longest URL contained in url_or_path
    is returned, so that a project is not shadowed by another one whose URL is a prefix of it.
    Return None if url_or_path is None or if nothing matches.
    """
    if url_or_path is None:
        return None
    stripped_url = url_or_path.replace(' ', '').strip().strip('/')
    best_match = None
    for project in projects:
        project_url = project['url']
        if stripped_url == project_url:
            return project
        # an empty URL is contained in every string, never treat it as a hit
        if not project_url or project_url not in stripped_url:
            continue
        if best_match is None or len(project_url) > len(best_match['url']):
            best_match = project
    return best_match
def find_issue(issues, document):
title_only_hit = None
metadata_hit = None
......@@ -180,7 +189,7 @@ def find_issue(issues, document):
for issue in issues:
if document['document'] in issue.title:
meta = get_issue_metadata(issue.description)
if meta == None:
if meta is None:
title_only_hit = issue
else:
if int(meta['mdms_id']) == document['mdms_id']:
......@@ -188,40 +197,44 @@ def find_issue(issues, document):
if len(meta['version']) > 0:
last_version = int(meta['version'])
else:
print('WARNING. We found a GitLab issue with the document number in the title and with metadata tag in description. But the metadata tag has wrong document id in it.')
print('WARNING. We found a GitLab issue with the document number in the title and with metadata '
'tag in description. But the metadata tag has wrong document id in it.')
return title_only_hit, metadata_hit, last_version
def get_issue_metadata(description):
    """
    Find and parse the metadata from the description of the issue.

    The metadata tag has the form '[meta]: # (mdms_id,document,title,version)'.
    Return a dict with the keys 'mdms_id', 'document', 'title' and 'version' (all values are
    strings). Return None if the tag is missing, is not closed with ')' or does not consist of
    exactly four comma separated fields.
    """
    pattern = '[meta]: # ('
    pos1 = description.find(pattern)
    if pos1 < 0:
        return None
    start = pos1 + len(pattern)
    pos2 = description.find(')', start)
    if pos2 < 0:
        # without this check the slice below would end at -1 and silently drop the last character
        return None
    meta = description[start:pos2].split(',')
    if len(meta) != 4:
        return None
    return {'mdms_id': meta[0], 'document': meta[1], 'title': meta[2], 'version': meta[3]}
def create_issue_metadata(document, details):
    """
    Create a metadata tag of the form '[meta]: # (mdms_id,document,title,version)'.

    Round brackets and commas are removed from the title because they delimit the tag and its
    fields. version is the highest 'version' found in details['documents'] or an empty string if
    that list is empty.
    """
    version = ''
    if details['documents']:
        last_doc = max(details['documents'], key=lambda x: x['version'])
        version = str(last_doc['version'])
    title = document['title'].replace('(', '').replace(')', '').replace(',', '')
    return '[meta]: # ({},{},{},{})'.format(document['mdms_id'], document['document'], title, version)
def create_issue_description_header(document, details):
'''
"""
Create issue description header, with metadata and the table
'''
"""
description = OPENING_TAG + '\n'
description += create_issue_metadata(document, details)
description += '\n\n| Container | Company | Authors | Document |\n'
......@@ -236,31 +249,50 @@ def create_issue_description_header(document, details):
description += CLOSING_TAG
return description
def create_issue_description(document, details):
    """
    Create the description of the issue: metadata, table, abstract

    If details['abstract'] is set it is added below an "Abstract" heading. Otherwise a checklist
    asking for the abstract is added, but only if the submitter is known and is not the
    'Secretariat'. The description always ends with the "automatically generated issue" note.
    """
    description = create_issue_description_header(document, details)
    # request abstract only if submitter is known and not 'Secretariat'
    submitter = details['submitted_by']
    request_abstract = submitter is not None and 'secretariat' not in submitter['name'].lower()
    if details['abstract']:
        description += '\n\n### Abstract\n'
        description += details['abstract']
    elif request_abstract:
        description += '\n\n### Abstract\n'
        description += '* [ ] please **add your abstract here**.\n'
        description += '* [ ] please also **add your abstract to MDMS** (this can be used when we create the ' \
                       'output document).\n'
    description += '\n\n_automatically generated issue_'
    return description
def create_issue_title(document):
    """
    Create the issue title: the document number followed by the document title.

    Titles longer than 255 characters are cut to 255 characters and a warning is printed.
    """
    title = document['document'].strip() + ' ' + document['title'].strip()
    if len(title) > 255:
        print('WARNING: The Title of the document {} is too long. GitLab only accepts max of '
              '255 characters'.format(document['document']))
        title = title[:255]
    return title
def get_updated_issue_description(current_decription, document, details):
    """
    Create a new issue description with a regenerated header.

    The header is created again from document and details. Everything that follows CLOSING_TAG in
    current_decription is appended to it unchanged.
    Return None if CLOSING_TAG is not found in current_decription.
    """
    pos1 = current_decription.find(CLOSING_TAG)
    if pos1 < 0:
        return None
    description = create_issue_description_header(document, details)
    description += current_decription[pos1 + len(CLOSING_TAG):]
    return description
def find_gitlab_users(gitlab_users, document):
usernames = []
regex = re.compile(r'[^a-zA-Z\s]')
......@@ -268,11 +300,11 @@ def find_gitlab_users(gitlab_users, document):
for author in document['authors']:
author_name = author['name'].lower().strip()
author_name = regex.sub('', author_name) # remove non alphabetic chars
author_name = ' '.join( [w for w in author_name.split() if len(w)>1] ) # remove single letters
author_name = ' '.join([w for w in author_name.split() if len(w) > 1]) # remove single letters
for key in gitlab_users:
gl_name = gitlab_users[key]['name'].lower().strip()
gl_name = regex.sub('', gl_name) # remove non alphabetic chars
gl_name = ' '.join( [w for w in gl_name.split() if len(w)>1] ) # remove single letters
gl_name = ' '.join([w for w in gl_name.split() if len(w) > 1]) # remove single letters
if author_name == gl_name:
usernames.append(key)
except:
......
# -*- coding: utf-8 -*-
'''
"""
This is the interface to MPEG Document Management System (mdms).
It requests the data from MDMS and parses the output HTML
'''
"""
import os
import requests
......@@ -21,10 +21,11 @@ DOCUMENT_URL = urljoin(BASE_URL, 'current_document.php')
MPEG_LOGIN = os.environ.get('MPEG_LOGIN')
MPEG_PWD = os.environ.get('MPEG_PWD')
class MDMSParser:
def parse_meetings(self, html):
meetings = []
soup = bs4.BeautifulSoup(html, features = 'lxml')
soup = bs4.BeautifulSoup(html, features='lxml')
tables = soup.find('body').find_all('table')
if len(tables) != 1:
print('Error: Only single table should be present in "All Meetings" frame. Did layout of MDMS change?')
......@@ -32,7 +33,7 @@ class MDMSParser:
rows = tables[0].find_all('tr')
for n in range(len(rows)):
if n==0: # check header first
if n == 0: # check header first
header = ['number', 'name', 'start date', 'end date', 'last input document', 'last output document']
if not self.check_table_header(header, rows[n]):
print('Error: Wrong table header. Did layout of MDMS change?')
......@@ -65,30 +66,32 @@ class MDMSParser:
return []
return meetings
def parse_author_entry(self, author_entry):
'''
@staticmethod
def parse_author_entry(author_entry):
"""
Search entry string for an email, remove it from the name and clean up
Return a tuple('name', 'email')
'''
"""
author_entry = author_entry.strip()
if len(author_entry) == 0:
return None
email = None
match = re.search(r'[\w\.-]+@[\w\.-]+', author_entry)
match = re.search(r'[\w.-]+@[\w.-]+', author_entry)
if match: # email found
email = match.group(0)
author_entry = author_entry.replace(email, '') # remove email from the name
# remove everything what is inside () or []
author_entry = re.sub(r'[\(\[].*?[\)\]]', '', author_entry)
# remove all non ASCII characters
author_entry = re.sub(r'[^\x00-\x7F]+', '', author_entry)
author_entry = re.sub(r'[(\[].*?[)\]]', '', author_entry)
# remove everything which is not a letter and space
author_entry = re.sub(r'[^a-zA-Z\s]+', '', author_entry)
author_entry = author_entry.strip()
return (author_entry, email)
return author_entry, email
def try_parsing_date(self, text):
'''
@staticmethod
def try_parsing_date(text):
"""
Try parsing the timestamp, if not possible return None
'''
"""
for fmt in ('%Y-%m-%d %H:%M:%S', '%Y-%m-%d'):
try:
return datetime.strptime(text.strip(), fmt)
......@@ -96,25 +99,25 @@ class MDMSParser:
pass
return None
def parse_input_docs(self, html):
def parse_search_docs(self, html):
docs = []
soup = bs4.BeautifulSoup(html, features = 'lxml')
for i in soup.select ('br'): # replace <br/> with a space, it makes checking headers easier
soup = bs4.BeautifulSoup(html, features='lxml')
for i in soup.select('br'): # replace <br/> with a space, it makes checking headers easier
i.replace_with(' ')
form = soup.find('body').find('form', id='documents')
if not form:
print('Error: No form with id="documents" found. Did MDMS layout change?')
return []
table = form.find('table')
if not table:
print('Error: No table found in form. Did MDMS layout change?')
return []
table_main = form.find('table')
if not table_main:
print('Error: No main table element found. Did MDMS layout change?')
return None
rows = table_main.find_all('tr', recursive=False)
rows = table.find_all('tr', recursive=False)
for n in range(len(rows)):
if n==0: # check header first
header = ['number', 'created', 'uploaded', 'Group Working Group / SubGroup', 'title', 'source', 'download']
if n == 0: # check header first
header = ['number', 'meeting', 'created', 'Uploaded', 'Group SubGroup', 'title', 'Author(s)', '']
if not self.check_table_header(header, rows[n]):
print('Error: Wrong table header. Did layout of MDMS change?')
return []
......@@ -130,16 +133,39 @@ class MDMSParser:
container_url = urljoin(DOCUMENT_URL, '?id={}'.format(mdms_id))
# get timestamp of the last uploaded version
last_version_uploaded = self.try_parsing_date(cols[2].text)
created_timestamp = self.try_parsing_date(cols[1].text)
created_timestamp = self.try_parsing_date(cols[2].text)
uploaded_timestamp = self.try_parsing_date(cols[3].text)
# get authors
authors = self._get_authors(cols[6])
# get latest document link (if available)
latest_url = None
if len(cols) == 8:
if not cols[7].find('a') is None:
latest_url = urljoin(CURRENT_MEETING_URL, cols[7].find('a')['href'])
docs.append({
'mdms_id': mdms_id,
'document': cols[0].text,
'meeting': cols[1].text,
'created': created_timestamp,
'last_version_uploaded': uploaded_timestamp,
'sub_group_text': cols[4].text,
'title': cols[5].text.strip(),
'authors': authors,
'latest_version_url': latest_url,
'container': container_url
})
except: # TODO: catch properly
print('Error: Could not parse search documents data. Did MDMS layout change?')
return []
return docs
def _get_authors(self, col):
authors = []
for entry in cols[5].contents:
for entry in col.contents:
if isinstance(entry, bs4.Tag):
parsed_href = entry.text
email = None
name = None
try:
parsed_href = urlparse(entry['href'])
email = parsed_href.path
......@@ -148,7 +174,7 @@ class MDMSParser:
if author_data:
name = author_data[0] # clean version of the name
# sometimes people type name and email wrong in MDMS and they are flipped
if not '@' in email and author_data[1]:
if '@' not in email and author_data[1]:
name = email