Source code for pycropml.transpiler.antlr_py.extract_metadata_from_comment
# inspired from http://csharphelper.com/blog/2015/10/extract-comments-from-a-c-file-in-c/
"""
information extraction over structured documentation or comments
"""
from pycropml.composition import ModelComposition
from pycropml.modelunit import ModelUnit
from pycropml.description import Description
from pycropml.inout import Input, Output
import os
import re
# Return a file's comments.
[docs]
def ExtractComments(filename, c_st_single, c_st_multi, c_end_multi):
# If a language has no block comment style, be sure that the default c_st_multi and c_end_multi will never be met.
if os.path.isfile(filename):
with open(filename, 'r', encoding='utf-8') as file:
all_text = file.read()
else: all_text = filename
comments = "";
while (len(all_text) > 0):
single_line_pos = all_text.find(c_st_single)
multi_line_pos = all_text.find(c_st_multi)
if (single_line_pos < 0) and multi_line_pos <0 : break
if (single_line_pos < 0) : single_line_pos = len(all_text)
if (multi_line_pos < 0): multi_line_pos = len(all_text)
if (single_line_pos < multi_line_pos):
end_pos = all_text.find("\n", single_line_pos +1);
if end_pos<0 :
comments += all_text[single_line_pos:] + "\r\n"
all_text = ""
else:
comments += all_text[single_line_pos: end_pos] +"\r\n"
all_text = all_text[end_pos + 1:]
else:
end_pos = all_text.find(c_end_multi, multi_line_pos + 1)
if (end_pos < 0):
comments += all_text[multi_line_pos:] + "\r\n"
all_text = ""
else:
comments += all_text[multi_line_pos : end_pos+2] + "\r\n";
all_text = all_text[end_pos + 1:]
return comments
#pattern_attr_val = r"(\*\*?\s*(?P<attribute>\w+)\s*:\s*(?P<value>[\-\(\)\w+\s:,ï\[\]\\_\./\'\*]*))"
pattern_attr_val = r"(\*\*?\s*(?P<attribute>\w+)\s*:\s*(?P<value>.*))"
def _search_group(pattern, text, group=1, flags=0, default=None):
"""Return match.group(group) or default if no match."""
m = re.search(pattern, text, flags)
return m.group(group) if m else default
import json
[docs]
def parse_default(val):
if isinstance(val, str):
s = val.strip()
if s.startswith("[") and s.endswith("]"):
try:
return json.loads(s)
except json.JSONDecodeError:
return val
return val
def _find_section(text, section_name):
"""
Return the raw body of a section like '- inputs:' up to next top-level '- <something>:'
Works even if the section is last.
"""
# top-level section markers: start of line then '-' then name then ':'
# allow optional comment prefixes like //, #, !, * and spaces
pat = rf"(?im)^[ \t*/#!-]*-\s*{re.escape(section_name)}\s*:\s*(.*?)(?=^[ \t*/#!-]*-\s*\w+\s*:|\Z)"
m = re.search(pat, text, flags=re.DOTALL)
return m.group(1) if m else None
def _split_named_items(section_body):
"""
Split a section body into chunks per '* name: ...'
Returns list of (name, body_after_name).
"""
if not section_body:
return []
# Normalize line endings
section_body = section_body.replace("\r\n", "\n").replace("\r", "\n")
# Find all occurrences of '* name: X' with their spans
item_pat = re.compile(r"(?im)^[ \t*/#!-]*\*\s*name\s*:\s*(?P<name>[^\n]+)\n?", re.MULTILINE)
matches = list(item_pat.finditer(section_body))
if not matches:
return []
items = []
for i, m in enumerate(matches):
start = m.end()
end = matches[i + 1].start() if i + 1 < len(matches) else len(section_body)
name = ensure_text(m.group("name").strip())
body = section_body[start:end].strip("\n")
items.append((name, body))
return items
[docs]
def attval(text):
"""
Parse lines like:
* Title: ...
** description : ...
Supports multiline continuation: lines without attribute are appended to last attribute.
"""
if not text:
return {}
text = text.replace("\r\n", "\n").replace("\r", "\n")
lines = [ln for ln in text.split("\n") if ln.strip()]
dic = {}
last_attr = None
for line in lines:
# Try strict match first
m = re.search(pattern_attr_val, line)
if m:
attr = m.group("attribute").strip()
val = m.group("value").strip()
val = ensure_text(val)
last_attr = attr
else:
# continuation line
if last_attr is None:
# If we don't know what to attach to, skip or store under a generic key
continue
attr = last_attr
val = ensure_text(line.strip())
# Clean common comment junk at line start
val = re.sub(r"^[ \t*/#!-]+", "", val).strip()
if attr in dic:
dic[attr] += "\n" + val
else:
dic[attr] = val
return dic
[docs]
def extract(comment):
# -------- header (safe) --------
header_patterns = {
"name": r"(?im)\bName\s*:\s*(?P<name>[^\s,]+)",
"version": r"(?im)\bVersion\s*:\s*(?P<version>[^\s,]+)",
"timestep": r"(?im)\bTime\s*step\s*:\s*(?P<timestep>[^\s,]+)",
}
head = {}
for k, pat in header_patterns.items():
m = re.search(pat, comment)
if m:
head[k] = ensure_text(m.group(k))
munit = ModelUnit(head)
# -------- description (optional) --------
desc_body = _find_section(comment, "Description")
if desc_body:
desc_dict = attval(desc_body)
d = Description()
for k, v in desc_dict.items():
setattr(d, k, v)
munit.add_description(d)
# -------- inputs (optional) --------
inputs_body = _find_section(comment, "inputs")
inpList = []
for name, body in _split_named_items(inputs_body):
data = {"name": name}
data.update(attval(body))
inpList.append(Input(data))
munit.inputs = inpList # empty if missing
# -------- outputs (optional) --------
outputs_body = _find_section(comment, "outputs")
outList = []
for name, body in _split_named_items(outputs_body):
data = {"name": name}
data.update(attval(body))
outList.append(Output(data))
munit.outputs = outList # empty if missing
return munit
[docs]
def extract_compo(comment):
keywords = ["name", "version", "timestep" ]
patterns = [r'(\s*-?\s*Name:\s*(?P<name>\w+))',
r'(\s*-?\s*Version:\s*(?P<version>\d+\.*\d+))',
r'(\s*-?\s*Time step:\s*(?P<timestep>\d+\.*\d*))']
# header of modelUnit name, version, timestep
head = {}
i = 0
for p in patterns:
if re.search(p, comment):
head[keywords[i]] = re.search(p, comment).group(keywords[i])
i = i + 1
m = ModelComposition(head)
# description element of modelUnit (Title, Authors, Reference, Institution, Abstract)
pat_description = r'-\s*Description:\s*(.*?)(?=\n\s*[#!/]*\s*-\s*inputs|\n\s*[#!/]*\s*-\s*outputs|$)'
text_description = re.search(pat_description, comment, re.DOTALL).group(1)
description = attval(text_description)
d = Description()
for k, v in description.items():
setattr(d, k, v)
m.add_description(d)
return m
"""
from pycropml.transpiler.antlr_py.extract_metadata_from_comment import ExtractComments, extract
from path import Path
file = Path("C:/Users/midingoy/Documents/SQ_Wheat_Phenology/src/f90/vernalizationprogress.f90")
r = ExtractComments(file, "!", '"""', '"""')
v = extract(r)
"""