# Source code for pycropml.transpiler.antlr_py.extract_metadata_from_comment

# Inspired by http://csharphelper.com/blog/2015/10/extract-comments-from-a-c-file-in-c/

"""
    Information extraction from structured documentation or comments.

"""
from pycropml.composition import ModelComposition
from pycropml.modelunit import ModelUnit
from pycropml.description import Description
from pycropml.inout import Input, Output
import os
import re
# Return a file's comments.

[docs]
def ExtractComments(filename, c_st_single, c_st_multi, c_end_multi):
    """Collect every comment found in a source file or in a source string.

    Args:
        filename: Path of the file to read (decoded as UTF-8). When it is not
            the path of an existing file, the value itself is scanned as the
            source text.
        c_st_single: Marker that opens a single-line comment (e.g. ``!``).
        c_st_multi: Marker that opens a block comment (e.g. ``/*``).
        c_end_multi: Marker that closes a block comment (e.g. ``*/``).

    Returns:
        A string made of each comment, delimiters included, in order of
        appearance, every one followed by ``"\\r\\n"``. Single-line comments
        stop before their newline; an unterminated block comment runs to the
        end of the text.

    Note:
        For a language without block comments, pass ``c_st_multi`` and
        ``c_end_multi`` values that can never occur in the text.
    """
    if os.path.isfile(filename):
        with open(filename, 'r', encoding='utf-8') as file:
            all_text = file.read()
    else:
        all_text = filename

    pieces = []
    total = len(all_text)
    pos = 0  # scan cursor; avoids re-slicing the remaining text on every hit
    while pos < total:
        single_at = all_text.find(c_st_single, pos)
        multi_at = all_text.find(c_st_multi, pos)
        if single_at < 0 and multi_at < 0:
            break
        # A missing marker is pushed past the end so the other one wins.
        if single_at < 0:
            single_at = total
        if multi_at < 0:
            multi_at = total

        if single_at < multi_at:
            stop = all_text.find("\n", single_at + 1)
            if stop < 0:
                pieces.append(all_text[single_at:])
                break
            pieces.append(all_text[single_at:stop])
            pos = stop + 1
        else:
            # Look for the closer only after the whole opener, so the two
            # delimiters can never overlap (e.g. "/*/").
            close_at = all_text.find(c_end_multi, multi_at + len(c_st_multi))
            if close_at < 0:
                pieces.append(all_text[multi_at:])
                break
            # Use the real closer length instead of assuming two characters,
            # and resume scanning after the closer rather than inside it.
            stop = close_at + len(c_end_multi)
            pieces.append(all_text[multi_at:stop])
            # Guarantee progress even with degenerate (empty) delimiters.
            pos = max(stop, multi_at + 1)

    return "".join(piece + "\r\n" for piece in pieces)


# Alternative pattern with an explicit character class for the value (disabled):
#pattern_attr_val = r"(\*\*?\s*(?P<attribute>\w+)\s*:\s*(?P<value>[\-\(\)\w+\s:,ï\[\]\\_\./\'\*]*))"
# Matches one "attribute : value" entry introduced by "*" or "**": the
# attribute is a single word and the value is whatever follows the colon up
# to the end of the line ("." does not match a newline here).
pattern_attr_val = r"(\*\*?\s*(?P<attribute>\w+)\s*:\s*(?P<value>.*))"


[docs]
def ensure_text(v):
    """Return ``v`` as text: bytes are decoded as UTF-8, anything else is returned unchanged."""
    return v.decode("utf-8") if isinstance(v, bytes) else v


def _search_group(pattern, text, group=1, flags=0, default=None):
    """Return match.group(group) or default if no match."""
    m = re.search(pattern, text, flags)
    return m.group(group) if m else default

import json

[docs]
def parse_default(val):
    """Turn a bracketed string such as ``"[1, 2]"`` into the list it encodes.

    Only strings whose stripped form starts with ``[`` and ends with ``]`` are
    parsed, using JSON syntax. Any other value, and any bracketed string that
    is not valid JSON, is returned unchanged.
    """
    if not isinstance(val, str):
        return val

    candidate = val.strip()
    looks_like_list = candidate[:1] == "[" and candidate[-1:] == "]"
    if not looks_like_list:
        return val

    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError:
        # Bracketed but not JSON (e.g. unquoted words): keep the raw text.
        return val
    return parsed


def _find_section(text, section_name):
    """
    Return the raw body of a section like '- inputs:' up to next top-level '- <something>:'
    Works even if the section is last.
    """
    # top-level section markers: start of line then '-' then name then ':'
    # allow optional comment prefixes like //, #, !, * and spaces
    pat = rf"(?im)^[ \t*/#!-]*-\s*{re.escape(section_name)}\s*:\s*(.*?)(?=^[ \t*/#!-]*-\s*\w+\s*:|\Z)"
    m = re.search(pat, text, flags=re.DOTALL)
    return m.group(1) if m else None

def _split_named_items(section_body):
    """
    Split a section body into chunks per '* name: ...'
    Returns list of (name, body_after_name).
    """
    if not section_body:
        return []

    # Normalize line endings
    section_body = section_body.replace("\r\n", "\n").replace("\r", "\n")

    # Find all occurrences of '* name: X' with their spans
    item_pat = re.compile(r"(?im)^[ \t*/#!-]*\*\s*name\s*:\s*(?P<name>[^\n]+)\n?", re.MULTILINE)
    matches = list(item_pat.finditer(section_body))
    if not matches:
        return []

    items = []
    for i, m in enumerate(matches):
        start = m.end()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(section_body)
        name = ensure_text(m.group("name").strip())
        body = section_body[start:end].strip("\n")
        items.append((name, body))
    return items


[docs]
def attval(text):
    """
    Parse lines like:
       * Title: ...
       ** description : ...
    into a dict mapping each attribute name to its value.

    Supports multiline continuation: a line that carries no attribute is
    appended (after a newline) to the most recent attribute. Continuation
    lines seen before any attribute are ignored, as are blank lines. An
    attribute that appears several times has its values joined with newlines.

    Leading comment characters (space, tab, *, /, #, !, -) are removed from
    continuation lines only. Attribute values are kept exactly as captured,
    so a negative number such as '** min : -20' keeps its sign.

    Returns an empty dict for empty or missing text.
    """
    if not text:
        return {}

    text = text.replace("\r\n", "\n").replace("\r", "\n")
    lines = [ln for ln in text.split("\n") if ln.strip()]

    comment_prefix = re.compile(r"^[ \t*/#!-]+")
    dic = {}
    last_attr = None

    for line in lines:
        m = re.search(pattern_attr_val, line)
        if m:
            attr = m.group("attribute").strip()
            # The value starts after the colon, so it cannot carry a comment
            # prefix; stripping one here would eat the '-' of negative values.
            val = ensure_text(m.group("value").strip())
            last_attr = attr
        else:
            # Continuation line: nothing to attach it to yet, so skip it.
            if last_attr is None:
                continue
            attr = last_attr
            # Drop the comment prefix that precedes the continued text.
            val = comment_prefix.sub("", ensure_text(line.strip())).strip()

        if attr in dic:
            dic[attr] += "\n" + val
        else:
            dic[attr] = val

    return dic



[docs]
def extract(comment):
    """Build a ModelUnit from the structured documentation held in ``comment``.

    The header fields (name, version, time step) are read first, then the
    optional Description section, then the optional inputs and outputs
    sections. Missing inputs or outputs give empty lists.
    """
    # -------- header: only the fields that are actually present --------
    header_patterns = (
        ("name", r"(?im)\bName\s*:\s*(?P<name>[^\s,]+)"),
        ("version", r"(?im)\bVersion\s*:\s*(?P<version>[^\s,]+)"),
        ("timestep", r"(?im)\bTime\s*step\s*:\s*(?P<timestep>[^\s,]+)"),
    )
    head = {}
    for key, regex in header_patterns:
        found = re.search(regex, comment)
        if found is not None:
            head[key] = ensure_text(found.group(key))

    munit = ModelUnit(head)

    # -------- description (optional) --------
    description_text = _find_section(comment, "Description")
    if description_text:
        description_fields = attval(description_text)
        desc = Description()
        for attr_name, attr_value in description_fields.items():
            setattr(desc, attr_name, attr_value)
        munit.add_description(desc)

    def _collect(section, factory):
        # One object per '* name:' item; parsed attributes may override "name".
        return [
            factory({"name": item_name, **attval(item_body)})
            for item_name, item_body in _split_named_items(_find_section(comment, section))
        ]

    # -------- inputs / outputs (optional, empty list when missing) --------
    munit.inputs = _collect("inputs", Input)
    munit.outputs = _collect("outputs", Output)

    return munit






[docs]
def extract_compo(comment):
    """Build a ModelComposition from the structured documentation in ``comment``.

    Reads the header fields (name, version, time step), then the Description
    section if there is one. A comment without a Description section yields a
    composition with no description attached instead of raising.
    """
    # Header of the composition: name, version, timestep.
    # The version accepts a single digit ("1") as well as "1.0".
    header_patterns = (
        ("name", r'(\s*-?\s*Name:\s*(?P<name>\w+))'),
        ("version", r'(\s*-?\s*Version:\s*(?P<version>\d+\.*\d*))'),
        ("timestep", r'(\s*-?\s*Time step:\s*(?P<timestep>\d+\.*\d*))'),
    )
    head = {}
    for key, pattern in header_patterns:
        found = re.search(pattern, comment)
        if found:
            head[key] = found.group(key)
    m = ModelComposition(head)

    # Description element (Title, Authors, Reference, Institution, Abstract):
    # everything after '- Description:' up to the inputs/outputs marker or
    # the end of the comment.
    pat_description = r'-\s*Description:\s*(.*?)(?=\n\s*[#!/]*\s*-\s*inputs|\n\s*[#!/]*\s*-\s*outputs|$)'
    found = re.search(pat_description, comment, re.DOTALL)
    if found:
        description = attval(found.group(1))
        d = Description()
        for k, v in description.items():
            setattr(d, k, v)
        m.add_description(d)
    return m



"""
from pycropml.transpiler.antlr_py.extract_metadata_from_comment import ExtractComments, extract
from path import Path
file = Path("C:/Users/midingoy/Documents/SQ_Wheat_Phenology/src/f90/vernalizationprogress.f90")
r = ExtractComments(file, "!", '"""', '"""') 
v = extract(r)


"""