Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in / Register
Toggle navigation
Menu
Open sidebar
Dimitri Podborski
AutomationTools
Commits
a0be21e9
Commit
a0be21e9
authored
Jan 25, 2021
by
Dimitri Podborski
Browse files
implement get_document_details
Allows you to fetch more information about a document, e.g. its abstract.
parent
3d8b701b
Changes
1
Hide whitespace changes
Inline
Side-by-side
automation/mdms.py
View file @
a0be21e9
...
...
@@ -181,6 +181,102 @@ class MDMSParser:
print
(
'Error: Could not parse input documents data. Did MDMS layout change?'
)
return
[]
return
docs
def parse_document_details(self, html):
    '''
    Parse the HTML of a single document page into a details dictionary.

    html: markup of the document page as a string.

    Returns a dictionary with the keys 'submitted_by' ({'name', 'email'}),
    'title', 'authors_string', 'organizations', 'abstract', 'related_docs',
    'ahg', 'sub_group', 'group', 'standard', 'activity' and 'documents'
    (a list of {'rel_path', 'version', 'timestamp'}). Fields that are missing
    or empty on the page stay None and 'documents' stays an empty list.
    Returns None (after printing an error) if the page does not have the
    expected table layout.
    '''
    details = {
        'submitted_by': None,
        'title': None,
        'authors_string': None,
        'organizations': None,
        'abstract': None,
        'related_docs': None,
        'ahg': None,
        'sub_group': None,
        'group': None,
        'standard': None,
        'activity': None,
        'documents': []
    }
    # (label fragment, details key) for all plain text fields. The first
    # match wins, so 'sub group' has to be listed before 'group'.
    text_fields = (
        ('title', 'title'),
        ('authors', 'authors_string'),
        ('organizations', 'organizations'),
        ('abstract', 'abstract'),
        ('related contributions', 'related_docs'),
        ('ahg', 'ahg'),
        ('sub group', 'sub_group'),
        ('group', 'group'),
        ('standard', 'standard'),
        ('activity', 'activity'),
    )

    soup = bs4.BeautifulSoup(html, features='lxml')
    for br in soup.select('br'):
        # replace <br/> with a space, it makes checking headers easier
        br.replace_with(' ')

    # do some checks if format is ok
    body = soup.find('body')
    table_main = body.find('table') if body is not None else None
    if table_main is None:
        print('Error: No main table element found. Did MDMS layout change?')
        return None
    rows_main = table_main.find_all('tr', recursive=False)
    if len(rows_main) != 3:
        print('Error: Main table should have 3 rows. Did MDMS layout change?')
        return None
    tables = rows_main[0].find_all('table')
    if len(tables) != 2:
        print('Error: First row in the main table should have only 2 tables in it. Did MDMS layout change?')
        return None

    # parse: every row of the second table is read as "attribute | value"
    for row in tables[1].find_all('tr', recursive=False):
        cols = row.find_all('td', recursive=False)
        if len(cols) < 2:
            continue  # not an "attribute | value" row, nothing to read
        attribute = cols[0].text.strip().lower()
        entry = cols[1].text.strip()
        if not entry:
            continue  # skip all empty fields

        if 'submitted by' in attribute:
            # the path component of the link target is stored as the e-mail;
            # it stays None when the cell carries no usable link
            link = cols[1].a
            email = None
            if link is not None and link.has_attr('href'):
                email = urlparse(link['href']).path
            details['submitted_by'] = {'name': entry, 'email': email}
            continue

        for label, key in text_fields:
            if label in attribute:
                details[key] = entry
                break
        else:
            # checked last so that e.g. 'related contributions' is not
            # shadowed by a more generic label
            if 'document' in attribute:
                details['documents'].extend(self._parse_document_versions(cols[1]))
    return details

def _parse_document_versions(self, cell):
    '''
    Collect the document versions listed inside one table cell.

    cell: the bs4 tag of the cell. Its direct children are scanned in order:
    a tag with an 'href' attribute provides 'rel_path', a text node of the
    form '(version <int> - date <date>)' provides 'version' and 'timestamp',
    and a whitespace-only text node closes the current record.

    Returns a list of {'rel_path', 'version', 'timestamp'} dictionaries.
    Records without any information are not reported, and a trailing record
    that is not followed by a whitespace-only node is still included.
    '''
    def new_record():
        return {'rel_path': None, 'version': None, 'timestamp': None}

    def has_data(record):
        return any(value is not None for value in record.values())

    documents = []
    current = new_record()
    for node in cell.contents:
        if isinstance(node, bs4.Tag):
            try:
                current['rel_path'] = urlparse(node['href']).path
            except KeyError:
                pass  # tag without a link target, ignore it
            continue

        text = node.string.strip()
        if not text:
            # whitespace-only node: the current record is complete
            if has_data(current):
                documents.append(current)
            current = new_record()
            continue

        # expected shape: '(version <int> - date <date>)'
        pos1 = text.find('(version')
        pos2 = text.find('- date', pos1 + 8)
        pos3 = text.find(')', pos2 + 6)
        if pos1 < 0 or pos2 < 0 or pos3 < 0:
            continue
        current['version'] = int(text[pos1 + 8:pos2].strip())
        current['timestamp'] = self.try_parsing_date(text[pos2 + 6:pos3])

    # do not lose the last record if nothing terminated it
    if has_data(current):
        documents.append(current)
    return documents
def
check_table_header
(
self
,
template
,
header_row
):
'''
...
...
@@ -320,9 +416,28 @@ def get_input_documents(meeting_id, standard=Standard.ALL, subgroup=Subgroup.ALL
def get_document_details(document_id):
    '''
    Get more details about a document.

    Fires a POST request to DOCUMENT_URL with the given document id and
    parses the response with MDMSParser.parse_document_details.

    Returns the parsed details:
    {'submitted_by': {'name', 'email'}, 'title', 'authors_string', 'organizations', 'abstract', 'related_docs', 'ahg', 'sub_group', 'group', 'standard', 'activity', 'documents': [{'rel_path', 'version', 'timestamp'}, ... ]}
    Returns None if the page could not be parsed and an empty list if the
    server did not answer with HTTP status 200.
    '''
    query = '?id={}'.format(document_id)
    url = urljoin(DOCUMENT_URL, query)
    # NOTE(review): no timeout is passed, so this call can block for as long
    # as the server keeps the connection open - consider adding one.
    response = requests.post(url, auth=(MPEG_LOGIN, MPEG_PWD))
    if response.status_code != 200:
        print('HTTP response {} != 200'.format(response.status_code))
        # indent the response body so it reads as part of the error message
        print('\t{}'.format(response.text.replace('\n', '\n\t')))
        return []
    parser = MDMSParser()
    return parser.parse_document_details(response.text)
def
find_documents
(
title
=
''
,
number
=
''
,
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment