
- Remove adapter_dwiggiecom.patch … it really doesn't work.

OBS-URL: https://build.opensuse.org/package/show/devel:languages:python/python-fanficfare?expand=0&rev=101
Matej Cepl 2023-03-10 12:11:40 +00:00 committed by Git OBS Bridge
parent dd9f8c6b5e
commit eb863efcb7
3 changed files with 5 additions and 416 deletions

adapter_dwiggiecom.patch

@ -1,413 +0,0 @@
From 45c6d71f57aefc3b63f2a4253eea3f730b76c6fb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mat=C4=9Bj=20Cepl?= <mcepl@cepl.eu>
Date: Wed, 15 Feb 2023 07:38:13 +0100
Subject: [PATCH] Add adapter_dwiggiecom, which however will not be ever pushed
upstream.
---
fanficfare/adapters/__init__.py | 1 +
fanficfare/adapters/adapter_dwiggiecom.py | 384 ++++++++++++++++++++++++++++++
2 files changed, 385 insertions(+)
create mode 100644 fanficfare/adapters/adapter_dwiggiecom.py
Index: FanFicFare-4.20.0/fanficfare/adapters/__init__.py
===================================================================
--- FanFicFare-4.20.0.orig/fanficfare/adapters/__init__.py
+++ FanFicFare-4.20.0/fanficfare/adapters/__init__.py
@@ -160,6 +160,7 @@ from . import adapter_psychficcom
from . import adapter_deviantartcom
from . import adapter_merengohu
from . import adapter_readonlymindcom
+from . import adapter_dwiggiecom
## This bit of complexity allows adapters to be added by just adding
## importing. It eliminates the long if/else clauses we used to need
Index: FanFicFare-4.20.0/fanficfare/adapters/adapter_dwiggiecom.py
===================================================================
--- /dev/null
+++ FanFicFare-4.20.0/fanficfare/adapters/adapter_dwiggiecom.py
@@ -0,0 +1,384 @@
+# -*- coding: utf-8 -*-
+
+# DO NOT PROPOSE TO MERGE! THERE ARE MANY GOOD REASONS WHY DWIGGIE IS
+# AMONG
+# https://github.com/JimmXinu/FanFicFare/wiki/Supportedsites#sites-not-supported
+# See also https://github.com/JimmXinu/FanFicFare/issues/903
+
+# Copyright 2011 Fanficdownloader team
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import logging
+import re
+
+from ..htmlcleanup import stripHTML
+from .. import exceptions as exceptions
+from ..six.moves.urllib.error import HTTPError
+
+from .base_adapter import BaseSiteAdapter, makeDate
+
+logger = logging.getLogger(__name__)
+
+
+def getClass():
+ return DwiggieComAdapter
+
+# Class name has to be unique. Our convention is camel case the
+# sitename with Adapter at the end. www is skipped.
+
+
+class DwiggieComAdapter(BaseSiteAdapter):
+
+ def __init__(self, config, url):
+ BaseSiteAdapter.__init__(self, config, url)
+
+# 1252 is a superset of iso-8859-1. Most sites that claim to be
+# iso-8859-1 (and some that claim to be utf8) are really windows-1252.
+ self.decode = ["Windows-1252", "utf8"]
+
+# if left empty, site doesn't return any message at all.
+ self.username = "NoneGiven"
+ self.password = ""
+ self.is_adult = False
+ self.sectionUrl = ""
+ self.section = []
+ self.chapters = dict()
+
+
+# # get storyId from url--url validation guarantees query is only
+# # sid=1234
+# self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])
+# logger.debug("storyId: (%s)"%self.story.getMetadata('storyId'))
+
+# get storyId from url--url validation guarantees query correct
+ m = re.match(self.getSiteURLPattern(), url)
+ if m:
+ self.story.setMetadata('storyId', m.group('id'))
+ logger.debug("storyId: (%s)" % self.story.getMetadata('storyId'))
+ # normalized story URL.
+ self._setURL('https://www.' + self.getSiteDomain() +
+ '/derby/'+self.story.getMetadata('storyId')+'.htm')
+ else:
+ raise exceptions.InvalidStoryURL(url,
+ self.getSiteDomain(),
+ self.getSiteExampleURLs())
+
+# Each adapter needs to have a unique site abbreviation.
+ self.story.setMetadata('siteabbrev', 'dwg')
+
+# The date format will vary from site to site.
+# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
+ self.dateformat = "%m/%d/%y"
+
+ @staticmethod # must be @staticmethod, don't remove it.
+ def getSiteDomain():
+ # The site domain. Does have www here, if it uses it.
+ return 'dwiggie.com'
+
+ @classmethod
+ def getAcceptDomains(cls):
+ return ['www.dwiggie.com', 'dwiggie.com', 'thedwg.com', 'TheDWG.com']
+
+ def getSiteExampleURLs(self):
+ return "https://"+self.getSiteDomain()+"/derby/name1b.htm"
+
+ def getSiteURLPattern(self):
+ # https://www.dwiggie.com/derby/mari17b.htm
+ return r"https?://(www.)?(thedwg|TheDWG|dwiggie)\.com/derby/(?P<id>(old_\d{4}\/|old[a-z]\/)?[a-z]+\d+)(?P<part>[a-z]*)\.htm$"
+
+ def tryArchivePage(self, url):
+ try:
+ data = self.get_request(url)
+
+ except HTTPError as e:
+ if e.code == 404:
+ # need to change the exception returned
+ raise exceptions.StoryDoesNotExist(self.meta)
+ else:
+ raise e
+
+ archivesoup = self.make_soup(data)
+ m = re.compile(r"/derby/" +
+ self.story.getMetadata('storyId')+"[a-z]?.htm$")
+# print(m.pattern)
+# print(archivesoup)
+ a = archivesoup.find('a', href=m)
+
+ return a
+
+ def getGenre(self, url):
+ if re.search('id=E', url):
+ genre = 'Epilogue Abbey'
+ else:
+ genre = 'Fantasia Gallery'
+ self.story.addToList('genre', genre)
+
+ def getItemFromArchivePage(self):
+
+ urls = ["https://www.dwiggie.com/toc/index.php?id=E&page=all&comp=n",
+ "https://www.dwiggie.com/toc/index.php?id=F&page=all&comp=n"]
+ for url in urls:
+ a = self.tryArchivePage(url)
+ if a is not None:
+ self.getGenre(url)
+ return a.parent
+ else:
+ return None
+
+ def getMetaFromSearch(self):
+
+ params = {}
+ params['title_name'] = self.story.getMetadata('title')
+
+ searchUrl = "https://" + self.getSiteDomain() + "/toc/search.php"
+
+ d = self._postUrl(searchUrl, params)
+# print(d)
+
+ searchsoup = self.make_soup(d)
+ m = re.compile(r"/derby/" + self.story.getMetadata('storyId') +
+ "[a-z]?.htm$")
+# print(m.pattern)
+# print(self.story.getMetadata('storyId'))
+ a = searchsoup.find('a', href=m)
+
+ return a
+
+ def getChaptersFromPage(self, url):
+ try:
+ data = self.get_request(url)
+ except HTTPError as e:
+ if e.code == 404:
+ return []
+ else:
+ raise e
+
+ s = self.story.getMetadata('storyId').split('/')
+ s.reverse()
+ storyId_trimmed = s[0]
+
+ m = re.match('.*?<body[^>]*>(\s*<ul>)?(?P<content>.*?)(</body>|$)',
+ data, re.DOTALL)
+ newdata = m.group('content')
+ regex = re.compile(r'<a\ href\=\"' + storyId_trimmed +
+ '[a-z]?.htm\">(Continued\ [Ii]n\ |Continue\ [Oo]n\ [Tt]o\ )?(the\ )?([Nn]ext\ [Ss]ection|[Ss]ection\ [0-9IVXCL]+)</a>')
+ newdata = re.sub(regex, '', newdata)
+
+
+# pagesections = filter(lambda x: x!=None, re.split('(?m)<hr( \/)?>|<p>\s*<hr( \/)?>\s*<\/p>', newdata, re.MULTILINE))
+# pagesections = filter(lambda x: x!=None, re.split('(?m)(<p>\s*)*<hr( \/)?>(\s*<\/p>)?', newdata, re.MULTILINE))
+ pagesections = filter(lambda x: x != None, re.split('<hr( \/)?>', newdata))
+ pagesections = filter(lambda x: x.strip() != '/', pagesections)
+# regex = re.compile(r'(href\="'+storyId_trimmed+'[a-z]?.htm$"')
+# pagesections = filter(lambda x: re.search(re.compile(storyId_trimmed + "[a-z]?.htm$"),x)==None, pagesections)
+ pagesections.pop(0) # always remove header
+
+ regex = re.compile(r'(?m)(href\="' + storyId_trimmed +
+ '[a-z]?.htm\"|Copyright\ held\ by\ the\ author|<p>\s*(Section\ I|Beginning),\s*</?p>)', re.MULTILINE)
+ s = filter(lambda x: regex.search(x), pagesections)
+# print(s)
+ pagesections = filter(lambda x: not regex.search(x), pagesections)
+# print(pagesections[0])
+ return pagesections
+
+ # Getting the chapter list and the meta data, plus 'is adult' checking.
+ def extractChapterUrlsAndMetadata(self):
+
+ url = self.url
+ meta = self.getItemFromArchivePage()
+# print(meta)
+
+# Title
+ t = meta.a
+ self.story.setMetadata('title', t.string.strip())
+
+# Author
+ author = meta.find('a', 'author_link')
+ if author is not None:
+ self.story.setMetadata('author', author.string.strip())
+ self.story.setMetadata('authorId', author['href'].split('=')[1])
+ self.story.setMetadata('authorUrl', author['href'])
+ author = author.parent
+ else:
+ author = meta.i
+ self.story.setMetadata('author',
+ author.string.replace('Written by', '')
+ .strip())
+ self.story.setMetadata('authorId', 'unknown')
+ self.story.setMetadata('authorUrl', 'unknown')
+
+
+# DateUpdated
+ dUpdate = meta.find('i', text=re.compile('Last update'))
+ du = dUpdate.replace('Last update', '').replace('.', '').strip()
+ try:
+ self.story.setMetadata('dateUpdated',
+ makeDate(du, self.dateformat))
+ except ValueError:
+ self.story.setMetadata('dateUpdated', makeDate(du, "%m/%d/%Y"))
+ compImg = meta.find('img', alt="Dot")
+ if compImg is not None:
+ self.story.setMetadata('status', 'Completed')
+ else:
+ self.story.setMetadata('status', 'In-Progress')
+
+
+# Summary & Category
+# Get the summary components from the meta listing
+ metalist = meta.contents
+ s = []
+ for x in range(0, len(metalist)-1):
+ item = metalist[x]
+ if item == author or item == compImg:
+ s = []
+ continue
+ if item == dUpdate or item == dUpdate.parent:
+ break
+ s.append(item)
+
+# create a soup object from the summary components
+ soup = self.make_soup("<p></p>")
+ d = soup.p
+ for x in s:
+ d.append(x)
+# print(d)
+
+# extract category from summary text
+ desc = stripHTML(d)
+ books = re.compile(r'(?P<book>\~P&P;?\~|\~Em;?\~|\~MP;?\~|\~S\&S;?\~|\~Per;?\~|\~NA;?\~|\~Juv;?\~|\~Misc;?\~)')
+ booklist = dict({'~P&P~': 'Pride and Prejudice', '~Em~': 'Emma',
+ '~MP~': 'Mansfield Park', '~S&S~':
+ 'Sense and Sensibility', '~Per~': 'Persuasion',
+ '~NA~': 'Northanger Abbey', '~Juv~': 'Juvenilia',
+ '~Misc~': 'Miscellaneous'})
+ m = re.search(books, desc)
+ print(m.group('book'))
+ book = booklist.get(m.group('book').replace(';', ''))
+ print(book)
+ self.story.addToList('category', book)
+
+
+# assign summary info
+ desc = stripHTML(desc).replace(book, '').strip()
+ desc = re.sub('^.\s*', '', desc)
+ if desc is not None:
+ self.setDescription(url, desc)
+
+# # Chapters (Sections in this case-don't know if we can subdivide them)
+
+# get the last Section from the archive page link
+# chapters = ["https://www.dwiggie.com"+t['href']]
+
+# get the section letter from the last page
+ tempUrl = t['href']
+ if "http://thedwg.com/" in tempUrl:
+ tempUrl = tempUrl.replace("http://thedwg.com/", "/")
+ elif "http://TheDWG.com/" in tempUrl:
+ tempUrl = tempUrl.replace("http://TheDWG.com/", "/")
+ elif "https://thedwg.com/" in tempUrl:
+ tempUrl = tempUrl.replace("https://thedwg.com/", "/")
+ elif "https://TheDWG.com/" in tempUrl:
+ tempUrl = tempUrl.replace("https://TheDWG.com/", "/")
+ m = re.match("/derby/" + self.story.getMetadata('storyId') +
+ "(?P<section>[a-z]?).htm$", tempUrl)
+ inc = m.group('section')
+ if inc == '':
+ inc = 'a'
+
+# get the presumed list of section urls with 'lower' section letters
+ sections = []
+ baseurl = "https://www.dwiggie.com/derby/"+self.story.getMetadata('storyId')
+ extension = ".htm"
+ ordend = ord(inc)
+ ordbegin = ord('a')
+ for numinc in range(ordbegin, ordend+1):
+ inc = chr(numinc)
+ if inc == 'a':
+ sections.append(baseurl+extension)
+ else:
+ sections.append(baseurl+inc+extension)
+
+ # Process List of Chapters
+ # create 'dummy' urls for individual chapters in the form
+ # 'pageurl#pageindex' where page index is an index starting with 0 per
+ # page
+ c = 0
+ postdate = None
+ chapters = []
+ for x in range(0, len(sections)):
+ section = sections[x]
+ i = 0
+ for chapter in self.getChaptersFromPage(section):
+ c += 1
+ chaptersoup = self.make_soup(chapter)
+# self.chapterUrls.append(('Chapter '+str(c),section+'#'+str(i)))
+ cUrl = section+'#'+str(i)
+ t = chaptersoup.find('font', size="+1", color="#336666")
+ ctitle = ''
+ if t is not None:
+ ctitle = stripHTML(t)
+# self.chapterUrls.append(('Chapter '+str(c),cUrl))
+ self.chapterUrls.append((ctitle, cUrl))
+ chapters.append((cUrl, chaptersoup))
+ if postdate is None:
+ regex = re.compile(r'Posted\ on\:?\ (?P<date>\d{4}\-\d{2}\-\d{2}|\w+,\ \d+\ \w+\ \d{4})')
+ # Sunday, 21 March 2004, at 6:00 a.m.
+ m = re.search(regex, chapter)
+ if m is not None:
+ postdate = m.group('date')
+ i += 1
+ self.chapters = dict(chapters)
+# print(postdate)
+ pubdate = None
+ if postdate is not None:
+ format1 = re.match(re.compile(r'\d{4}\-\d{2}\-\d{2}'), postdate)
+ format2 = re.match(re.compile(r'\w+,\ \d+\ \w+\ \d{4}'), postdate)
+ if format1 is not None:
+ pubdate = makeDate(postdate, "%Y-%m-%d")
+ if format2 is not None:
+ pubdate = makeDate(postdate, "%A, %d %B %Y")
+
+ if pubdate is None:
+ pubdate = makeDate(self.story.getMetadata('dateUpdated'),
+ "%Y-%m-%d")
+# print(pubdate)
+ self.story.setMetadata('datePublished', pubdate)
+# print(self.story.getMetadata('dateUpdated'))
+# print(self.story.getMetadata('datePublished'))
+ self.story.setMetadata('numChapters', c)
+ logger.debug("numChapters: (%s)" % self.story.getMetadata('numChapters'))
+
+ # grab the text for an individual chapter.
+ def getChapterText(self, url):
+ logger.debug('Getting chapter text from: %s' % url)
+
+ chapter = self.chapters.get(url)
+# for c in self.chapters:
+# if c[0] == url:
+# chapter = c[1]
+# chapter = self.make_soup(c[1])
+
+# chapter = find(lambda c: c[0] == url, self.chapters)[1]
+# page_url = url.split('#')[0]
+# x = url.split('#')[1]
+# if self.sectionUrl != page_url:
+# self.sectionUrl = page_url
+# self.section = self.getChaptersFromPage(page_url)
+#
+# chapter = self.make_soup(self.section[int(x)])
+
+# chapter = self.make_soup(self.getChaptersFromPage(page_url)[int(x)])
+
+ return self.utf8FromSoup(url, chapter)

python-fanficfare.changes

@@ -1,3 +1,8 @@
-------------------------------------------------------------------
Fri Mar 10 12:09:49 UTC 2023 - Matej Cepl <mcepl@suse.com>

- Remove adapter_dwiggiecom.patch … it really doesn't work.

-------------------------------------------------------------------
Fri Mar 3 09:15:19 UTC 2023 - Dirk Müller <dmueller@suse.com>

python-fanficfare.spec

@@ -27,9 +27,6 @@ License: GPL-3.0-only
Group: Development/Languages/Python
URL: https://github.com/JimmXinu/FanFicFare
Source: https://github.com/JimmXinu/FanFicFare/archive/v%{version}/FanFicFare-%{version}.tar.gz
# PATCH-FEATURE-OPENSUSE adapter_dwiggiecom.patch gh#JimmXinu/FanFicFare#903 mcepl@suse.com
# adapter for dwiggie.com, which is probably not for upstream
Patch0: adapter_dwiggiecom.patch
BuildRequires: %{python_module beautifulsoup4}
BuildRequires: %{python_module chardet}
BuildRequires: %{python_module cloudscraper}