# Source code for ldsnotes.content

import requests
import html.parser
import re

# Module-level HTML parser instance. Nothing in the visible part of this
# module references it (clean_html relies on html.unescape instead).
H = html.parser.HTMLParser()
# Endpoint of the content API that Content.fetch posts its URIs to.
CONTENT = "https://www.churchofjesuschrist.org/content/api/v2"


def clean_html(text):
    """Strip the markup from an html snippet, leaving plain text.

    Footnote markers are not deleted outright: each one is swapped for a
    ``#`` placeholder so that later word counting can account for it.

    Parameters
    -----------
        text : string
            html to clean

    Returns
    --------
        text : string
            cleaned text"""

    # decode html entities before any pattern matching happens
    decoded = html.unescape(text)

    # A footnoted word directly followed by punctuation gets a "#" on both
    # sides, which keeps the punctuation from being counted as its own word.
    marked = re.sub(
        r'<sup class=\"marker\">\w</sup>(\w*)</a>([!?.,])',
        r'#\1#\2',
        decoded)

    # every remaining footnote marker collapses to a single "#"
    marked = re.sub(r'<sup class=\"marker\">\w</sup>', '#', marked)

    # drop whatever tags are still present
    stripped = re.sub('<.*?>', '', marked)

    # non-breaking spaces become ordinary spaces
    return stripped.replace(u'\xa0', u' ')


class Content:
    """Class that pulls/represents content from anywhere on
    churchofjesuschrist.org/study (theoretically)

    Parameters
    ----------
    json : dict
        Dictionary made from json pull from lds.org's API.

    Attributes
    -----------
    sep_content : list
        Cleaned text of each pulled paragraph/verse, one string per entry.
        Footnote placeholders (``#``) are still present in these strings.
    content : string
        Book, talk, or section of content: the entries of ``sep_content``
        joined by newlines with the ``#`` placeholders removed.
    headline : string
        Name of the article, ie name of conference talk or "Helaman 3".
    reference : string
        Full display reference, like "Helaman 3:29" for scriptures.
    publication : string
        Overarching publication. Think BoM, DoC, General Conference 2020, etc.
    url : string
        URL of where the content is located (including the paragraph/verse).
    uri : string
        URI that it was pulled with.
    p_start : int
        First verse/paragraph pulled.
    p_end : int
        Last verse/paragraph pulled.
    """

    def __init__(self, json):
        """Build the object from one entry of the API response.

        Parameters
        ----------
        json : dict
            Must provide the keys ``content``, ``headline``,
            ``referenceURIDisplayText``, ``publication`` and ``uri``.
            Each item of ``content`` must provide ``markup`` and ``id``.
        """
        paragraphs = json['content']

        # actual text, one cleaned string per paragraph/verse
        self.sep_content = [clean_html(p['markup']) for p in paragraphs]
        # "#" only marks footnotes for word counting; drop it from the text
        self.content = "\n".join(self.sep_content).replace("#", "")

        # name of article ie name of conference talk or Helaman 3
        self.headline = json['headline']
        # full reference for scriptures like Helaman 3:29
        self.reference = json['referenceURIDisplayText']
        # refers to book (ie GC 2020, or BOM)
        self.publication = json['publication']
        # uri it was pulled with
        self.uri = json['uri']

        # paragraph or verse #'s: ids look like "p29", so the first
        # character is skipped and the remainder parsed as an integer
        self.p_start = int(paragraphs[0]['id'][1:])
        self.p_end = int(paragraphs[-1]['id'][1:])

        # A uri such as "/eng/scriptures/bofm/hel/3.p29" splits into
        # ['', 'eng', 'scriptures', ...]: index 1 is the language and the
        # rest is the path below /study/.
        uri_parts = json['uri'].split('/')
        lang = uri_parts[1]
        self.url = "https://www.churchofjesuschrist.org/study/" + \
            "/".join(uri_parts[2:]) + "?lang=" + lang

    def __str__(self):
        """Return the cleaned text of the content."""
        return self.content

    __repr__ = __str__
    # ``__print__`` is not a Python special method; the name is kept only so
    # existing callers that invoke it explicitly keep working.
    __print__ = __str__

    @staticmethod
    def fetch(uris, json=False):
        """Method to actually make content. This is where the magic happens.
        Requires a proper URI to fetch content.

        Parameters
        ----------
        uris : list
            List of URIs to pull from lds.org. See below for example.
        json : bool
            Whether to return as list of Content objects or the raw
            dictionaries. Most useful in debugging. Defaults to False.

        Returns
        --------
        Either a list of Content objects, or (when ``json`` is True) a list
        of the raw dictionaries, in the same order as ``uris``.

        Examples
        ---------
        >>> Content.fetch(["/eng/scriptures/bofm/hel/3.p29"])
        [29 Yea, we see that whosoever will may lay hold upon the word of God, which is quick and powerful, which shall divide asunder all the cunning and the snares and the wiles of the devil, and lead the man of Christ in a strait and narrow course across that everlasting gulf of misery which is prepared to engulf the wicked—]

        >>> Content.fetch(["/eng/scriptures/bofm/hel/3.p29"], json=True)
        [{'content': [{'displayId': '29',
                       'id': 'p29',
                       'markup': '<p class="verse" data-aid="128356897" id="p29"><span '
                                 'class="verse-number">29 </span>Yea, we see that '
                                 'whosoever will may lay hold upon the <a '
                                 'class="study-note-ref" href="#note29a"><sup '
                                 'class="marker">a</sup>word</a> of God, which is <a '
                                 'class="study-note-ref" href="#note29b"><sup '
                                 'class="marker">b</sup>quick</a> and powerful, which '
                                 'shall <a class="study-note-ref" href="#note29c"><sup '
                                 'class="marker">c</sup>divide</a> asunder all the '
                                 'cunning and the snares and the wiles of the devil, '
                                 'and lead the man of Christ in a strait and <a '
                                 'class="study-note-ref" href="#note29d"><sup '
                                 'class="marker">d</sup>narrow</a> course across that '
                                 'everlasting <a class="study-note-ref" '
                                 'href="#note29e"><sup class="marker">e</sup>gulf</a> '
                                 'of misery which is prepared to engulf the '
                                 'wicked&#x2014;</p>'}],
          'headline': 'Helaman 3',
          'image': {},
          'publication': 'Book of Mormon',
          'referenceURI': '/eng/scriptures/bofm/hel/3.p29?lang=eng#p29',
          'referenceURIDisplayText': 'Helaman 3:29',
          'type': 'chapter',
          'uri': '/eng/scriptures/bofm/hel/3.p29'}]
        """  # noqa: E501
        # The decoded response is indexed by each requested uri below.
        resp = requests.post(url=CONTENT, data={"uris": uris}).json()
        if json:
            return [resp[u] for u in uris]
        return [Content(resp[u]) for u in uris]