Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in / Register
Toggle navigation
Menu
Open sidebar
Dimitri Podborski
AutomationTools
Commits
f736ec56
Commit
f736ec56
authored
Jan 21, 2021
by
Dimitri Podborski
😂
Browse files
Merge branch 'ff-victims' into 'master'
Ff victims See merge request
!3
parents
1acb527d
f0dada7b
Changes
2
Hide whitespace changes
Inline
Side-by-side
automation/mdms.py
View file @
f736ec56
...
...
@@ -5,10 +5,7 @@ import requests
import
bs4
from
datetime
import
datetime
from
enum
import
Enum
,
unique
# "curl -s -X POST -d 'id=" + docID + "&id_meeting=' -u $MPEG_LOGIN:$MPEG_PWD https://dms.mpeg.expert/doc_end_user/current_document.php"
# "curl -s -X POST -d 'search_title=&search_number=" + contrNr + "&search_category=&search_author=&search_id_group=1&search_sub_group=1&id_meeting=&submit=Search&meeting=' -u $MPEG_LOGIN:$MPEG_PWD https://dms.mpeg.expert/doc_end_user/searchAcross.php"
import
re
BASE_URL
=
'https://dms.mpeg.expert/doc_end_user/'
MEETINGS_URL
=
urljoin
(
BASE_URL
,
'all_meeting.php'
)
...
...
@@ -60,6 +57,37 @@ class MDMSParser:
return
[]
return
meetings
def parse_author_entry(self, author_entry):
    '''
    Split a raw author string into a cleaned-up name and an optional email.

    The first substring that looks like an email address is taken as the
    email and removed from the text. Anything enclosed in () or [] and all
    non-ASCII characters are then dropped from what remains.

    Return None if the entry is empty after stripping whitespace,
    otherwise a tuple (name, email) where email is None if none was found.
    '''
    remainder = author_entry.strip()
    if not remainder:
        return None

    found = re.search(r'[\w\.-]+@[\w\.-]+', remainder)
    email = found.group(0) if found else None
    if email is not None:
        # cut the address out so that only the name part is left
        remainder = remainder.replace(email, '')

    # drop bracketed annotations: everything inside () or []
    without_brackets = re.sub(r'[\(\[].*?[\)\]]', '', remainder)
    # drop every run of non-ASCII characters
    ascii_only = re.sub(r'[^\x00-\x7F]+', '', without_brackets)
    return (ascii_only.strip(), email)
def try_parsing_date(self, text):
    '''
    Parse a timestamp string into a datetime.

    Surrounding whitespace is ignored. Two layouts are tried in order:
    'YYYY-MM-DD HH:MM:SS' and the date-only 'YYYY-MM-DD'.

    Return the datetime for the first layout that matches, or None if
    the text matches neither (including an empty string).
    '''
    cleaned = text.strip()
    # The date-only format must start with '%Y'. A bare 'Y' makes strptime
    # look for a literal "Y", so date-only values could never be parsed.
    for fmt in ('%Y-%m-%d %H:%M:%S', '%Y-%m-%d'):
        try:
            return datetime.strptime(cleaned, fmt)
        except ValueError:
            # this layout does not match, try the next one
            continue
    return None
def
parse_input_docs
(
self
,
html
):
docs
=
[]
soup
=
bs4
.
BeautifulSoup
(
html
,
features
=
'lxml'
)
...
...
@@ -93,26 +121,45 @@ class MDMSParser:
mdms_id
=
int
(
parse_qs
(
parsed_href
.
query
)[
'id'
][
0
])
# get timestamp of the last uploaded version
last_version_uploaded
=
None
if
len
(
cols
[
2
].
text
.
strip
())
>
0
:
last_version_uploaded
=
datetime
.
strptime
(
cols
[
2
].
text
.
strip
(),
'%Y-%m-%d %H:%M:%S'
)
last_version_uploaded
=
self
.
try_parsing_date
(
cols
[
2
].
text
)
created_timestamp
=
self
.
try_parsing_date
(
cols
[
1
].
text
)
# get authors
authors
=
[]
for
entry
in
cols
[
5
].
contents
:
if
isinstance
(
entry
,
bs4
.
Tag
):
parsed_href
=
urlparse
(
entry
[
'href'
])
parsed_href
=
entry
.
text
email
=
None
name
=
None
try
:
parsed_href
=
urlparse
(
entry
[
'href'
])
email
=
parsed_href
.
path
author_data
=
self
.
parse_author_entry
(
entry
.
text
)
name
=
entry
.
text
if
author_data
:
name
=
author_data
[
0
]
# clean version of the name
# sometimes people type name and email wrong in MDMS and they are flipped
if
not
'@'
in
email
and
author_data
[
1
]:
name
=
email
email
=
author_data
[
1
]
except
KeyError
:
# sometimes Author's field is formatted with fake html tags.
print
(
'Bad HTML format in Authors field: '
,
entry
)
name
=
entry
.
text
pass
authors
.
append
({
'name'
:
entry
.
text
,
'email'
:
parsed_href
.
path
'name'
:
name
,
'email'
:
email
})
else
:
name
=
entry
.
string
.
replace
(
','
,
''
).
strip
()
if
len
(
name
)
>
0
:
authors
.
append
({
'name'
:
entry
.
string
.
replace
(
','
,
''
).
strip
(),
'email'
:
''
})
for
author_entry
in
entry
.
string
.
replace
(
' and '
,
','
).
split
(
','
):
author_data
=
self
.
parse_author_entry
(
author_entry
)
if
author_data
:
authors
.
append
({
'name'
:
author_data
[
0
],
'email'
:
author_data
[
1
]
})
# get latest document link (if available)
latest_url
=
None
...
...
@@ -123,7 +170,7 @@ class MDMSParser:
docs
.
append
({
'mdms_id'
:
mdms_id
,
'document'
:
cols
[
0
].
text
,
'created'
:
d
atetime
.
st
rptime
(
cols
[
1
].
text
,
'%Y-%m-%d %H:%M:%S'
)
,
'created'
:
cre
ate
d_
timest
amp
,
'last_version_uploaded'
:
last_version_uploaded
,
'sub_group_text'
:
cols
[
3
].
text
,
'title'
:
cols
[
4
].
text
.
strip
(),
...
...
find_contributors.py
0 → 100644
View file @
f736ec56
from
automation
import
mdms
'''
This is an example of how to get a list of authors who had input documents
with a specific substring in the title
'''
# Lowest meeting number to process; meetings with a smaller number are skipped.
LAST_MEETING_NUMBER = 42

# Substrings that mark a document title as relevant. They are handed to
# document_is_relevant, which compares them case-insensitively.
TITLE_WHITELIST = ['ISOBMFF', 'BMFF', '14496-12', 'file format', 'MP4', 'ISOBMF', 'ISO Base']

# Accumulator filled by the script below: author name -> list of the
# relevant input documents that author is listed on.
authors = {}
def document_is_relevant(title, filter_strings):
    '''
    Tell whether a title contains at least one of the filter strings.

    Both the title and each filter string are lower-cased and stripped of
    surrounding whitespace before the substring check.
    Return True on the first match, False if nothing matches (or if
    filter_strings is empty).
    '''
    for candidate in filter_strings:
        needle = candidate.lower().strip()
        haystack = title.lower().strip()
        if needle in haystack:
            return True
    return False
# Walk over all meetings numbered LAST_MEETING_NUMBER or higher and group
# the input documents with a whitelisted title by author name.
meetings = mdms.get_meetings()
for meeting in meetings:
    if meeting['number'] < LAST_MEETING_NUMBER:
        continue
    print('process meeting', meeting['number'])
    input_docs = mdms.get_input_documents(meeting['id'])
    for doc in input_docs:
        if not document_is_relevant(doc['title'], TITLE_WHITELIST):
            continue
        for author in doc['authors']:
            # setdefault creates the per-author list the first time a name
            # is seen, so no separate "already known?" branch is needed
            authors.setdefault(author['name'], []).append(doc)

# print author names and number of contributions
for author, contributions in authors.items():
    print(author, ' :', len(contributions))
\ No newline at end of file
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment