From 105eba57a72efcd437e9dfcf631c9ea571843cc28ce2df7dae27836ade23553d Mon Sep 17 00:00:00 2001
From: Matej Cepl <mcepl@suse.com>
Date: Wed, 15 Feb 2023 06:53:20 +0000
Subject: [PATCH] - Update to 4.20.0:

  - Fixes for adapter_fictionlive story URLs-normalize & skip
    unsub URL
  - adapter_deviantartcom date changes #910, thanks bugmaschine
  - Revamp retries for browser cache with open_pages_in_browser
  - Fix for & in chapter title.
  - Add r_anthmax/n_anthmax options for custom_columns_settings
  - Fixed the Deviantart adapter not detecting that a Deviation
    is Marked as needing a login, thanks bugmaschine
  - Skip day of week for localization in browsercache_firefox2
  - Move makeDate to dateutils to call from browsercache_firefox2
  - adapter_mediaminerorg: Updates for site changes
  - adapter_fastnovelsnet: Fixes for site changes -- tested with
    use_flaresolverr_proxy
  - Update language->langcode mapping for updated AO3 list
  - Browser Cache Refactor & open_pages_in_browser feature
  - See PR #905 and this MR post.
  - Fixes for config base_xenforo options, closes #902
  - Fix for adapter_quotevcom status
  - Equalize ok/cancel buttons on user/pass & email pass dialogs
  - adapter_ficbooknet: Site change for status + remove debug
  - Tweak for adapter_storiesonlinenet description parsing
- Add adapter_dwiggiecom.patch (gh#JimmXinu/FanFicFare#903)
  for openSUSE-only (reasons are in the patch) support for
  dwiggie.com.
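
With the patch applied, dwiggie.com story URLs are handled like any
other supported site; an illustrative invocation (any /derby/ story
URL, with or without a trailing section letter, is accepted):

    fanficfare https://www.dwiggie.com/derby/mari17b.htm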

OBS-URL: https://build.opensuse.org/package/show/devel:languages:python/python-fanficfare?expand=0&rev=97
---
 FanFicFare-4.19.0.tar.gz  |   3 -
 FanFicFare-4.20.0.tar.gz  |   3 +
 adapter_dwiggiecom.patch  | 413 ++++++++++++++++++++++++++++++++++++++
 python-fanficfare.changes |  29 +++
 python-fanficfare.spec    |   5 +-
 5 files changed, 449 insertions(+), 4 deletions(-)
 delete mode 100644 FanFicFare-4.19.0.tar.gz
 create mode 100644 FanFicFare-4.20.0.tar.gz
 create mode 100644 adapter_dwiggiecom.patch

diff --git a/FanFicFare-4.19.0.tar.gz b/FanFicFare-4.19.0.tar.gz
deleted file mode 100644
index bd0dd83..0000000
--- a/FanFicFare-4.19.0.tar.gz
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:0bba6a27fbbd4a811ae30003888a7ba7cf7fdd3804b65f74625d8efafad04379
-size 1965733
diff --git a/FanFicFare-4.20.0.tar.gz b/FanFicFare-4.20.0.tar.gz
new file mode 100644
index 0000000..2d4d4af
--- /dev/null
+++ b/FanFicFare-4.20.0.tar.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:719e86d23b0a724bb55926bf300dcf416bd6e1c4c1f64f7fe3dd3cedbd38cfa4
+size 1970872
diff --git a/adapter_dwiggiecom.patch b/adapter_dwiggiecom.patch
new file mode 100644
index 0000000..b71fbab
--- /dev/null
+++ b/adapter_dwiggiecom.patch
@@ -0,0 +1,413 @@
+From 45c6d71f57aefc3b63f2a4253eea3f730b76c6fb Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Mat=C4=9Bj=20Cepl?= <mcepl@suse.com>
+Date: Wed, 15 Feb 2023 07:38:13 +0100
+Subject: [PATCH] Add adapter_dwiggiecom, which however will never be pushed
+ upstream.
+
+---
+ fanficfare/adapters/__init__.py           |   1 +
+ fanficfare/adapters/adapter_dwiggiecom.py | 384 ++++++++++++++++++++++++++++++
+ 2 files changed, 385 insertions(+)
+ create mode 100644 fanficfare/adapters/adapter_dwiggiecom.py
+
+Index: FanFicFare-4.20.0/fanficfare/adapters/__init__.py
+===================================================================
+--- FanFicFare-4.20.0.orig/fanficfare/adapters/__init__.py
++++ FanFicFare-4.20.0/fanficfare/adapters/__init__.py
+@@ -160,6 +160,7 @@ from . import adapter_psychficcom
+ from . import adapter_deviantartcom
+ from . import adapter_merengohu
+ from . import adapter_readonlymindcom
++from . import adapter_dwiggiecom
+ 
+ ## This bit of complexity allows adapters to be added by just adding
+ ## importing. It eliminates the long if/else clauses we used to need
+Index: FanFicFare-4.20.0/fanficfare/adapters/adapter_dwiggiecom.py
+===================================================================
+--- /dev/null
++++ FanFicFare-4.20.0/fanficfare/adapters/adapter_dwiggiecom.py
+@@ -0,0 +1,384 @@
++# -*- coding: utf-8 -*-
++
++# DO NOT PROPOSE TO MERGE! THERE ARE MANY GOOD REASONS WHY DWIGGIE IS
++# AMONG
++# https://github.com/JimmXinu/FanFicFare/wiki/Supportedsites#sites-not-supported
++# See also https://github.com/JimmXinu/FanFicFare/issues/903
++
++# Copyright 2011 Fanficdownloader team
++#
++# Licensed under the Apache License, Version 2.0 (the "License");
++# you may not use this file except in compliance with the License.
++# You may obtain a copy of the License at
++#
++#     http://www.apache.org/licenses/LICENSE-2.0
++#
++# Unless required by applicable law or agreed to in writing, software
++# distributed under the License is distributed on an "AS IS" BASIS,
++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++# See the License for the specific language governing permissions and
++# limitations under the License.
++#
++
++import logging
++import re
++
++from ..htmlcleanup import stripHTML
++from .. import exceptions as exceptions
++from ..six.moves.urllib.error import HTTPError
++
++from .base_adapter import BaseSiteAdapter, makeDate
++
++logger = logging.getLogger(__name__)
++
++
++def getClass():
++    return DwiggieComAdapter
++
++# Class name has to be unique. Our convention is camel case the
++# sitename with Adapter at the end. www is skipped.
++
++
++class DwiggieComAdapter(BaseSiteAdapter):
++
++    def __init__(self, config, url):
++        BaseSiteAdapter.__init__(self, config, url)
++
++# 1252 is a superset of iso-8859-1. Most sites that claim to be
++# iso-8859-1 (and some that claim to be utf8) are really windows-1252.
++        self.decode = ["Windows-1252", "utf8"]
++
++# if left empty, site doesn't return any message at all.
++        self.username = "NoneGiven"
++        self.password = ""
++        self.is_adult = False
++        self.sectionUrl = ""
++        self.section = []
++        self.chapters = dict()
++
++
++# # get storyId from url--url validation guarantees query is only
++# # sid=1234
++# self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])
++# logger.debug("storyId: (%s)"%self.story.getMetadata('storyId'))
++
++# get storyId from url--url validation guarantees query correct
++        m = re.match(self.getSiteURLPattern(), url)
++        if m:
++            self.story.setMetadata('storyId', m.group('id'))
++            logger.debug("storyId: (%s)" % self.story.getMetadata('storyId'))
++            # normalized story URL.
++            self._setURL('https://www.' + self.getSiteDomain() +
++                         '/derby/'+self.story.getMetadata('storyId')+'.htm')
++        else:
++            raise exceptions.InvalidStoryURL(url,
++                                             self.getSiteDomain(),
++                                             self.getSiteExampleURLs())
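++
++# Story pages on the site are plain static files: the first section of
++# story 'mari17' lives at /derby/mari17.htm and its continuations at
++# /derby/mari17b.htm, /derby/mari17c.htm, ...; older stories sit under
++# /derby/old_<year>/ or /derby/old<letter>/ prefixes (see
++# getSiteURLPattern() below).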
++
++# Each adapter needs to have a unique site abbreviation.
++        self.story.setMetadata('siteabbrev', 'dwg')
++
++# The date format will vary from site to site.
++# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
++        self.dateformat = "%m/%d/%y"
++
++    @staticmethod  # must be @staticmethod, don't remove it.
++    def getSiteDomain():
++        # The site domain. Does have www here, if it uses it.
++        return 'dwiggie.com'
++
++    @classmethod
++    def getAcceptDomains(cls):
++        return ['www.dwiggie.com', 'dwiggie.com', 'thedwg.com', 'TheDWG.com']
++
++    def getSiteExampleURLs(self):
++        return "https://"+self.getSiteDomain()+"/derby/name1b.htm"
++
++    def getSiteURLPattern(self):
++        # https://www.dwiggie.com/derby/mari17b.htm
++        return r"https?://(www.)?(thedwg|TheDWG|dwiggie)\.com/derby/(?P<id>(old_\d{4}\/|old[a-z]\/)?[a-z]+\d+)(?P<section>[a-z]*)\.htm$"
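++# e.g. 'https://www.dwiggie.com/derby/mari17b.htm' matches with
++# id='mari17' and section='b'; __init__() above then normalizes any
++# accepted URL back to the first-section form
++# 'https://www.dwiggie.com/derby/mari17.htm'.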
++
++    def tryArchivePage(self, url):
++        try:
++            data = self.get_request(url)
++
++        except HTTPError as e:
++            if e.code == 404:
++                # need to change the exception returned
++                raise exceptions.StoryDoesNotExist(self.meta)
++            else:
++                raise e
++
++        archivesoup = self.make_soup(data)
++        m = re.compile(r"/derby/" +
++                       self.story.getMetadata('storyId')+"[a-z]?.htm$")
++# print(m.pattern)
++# print(archivesoup)
++        a = archivesoup.find('a', href=m)
++
++        return a
++
++    def getGenre(self, url):
++        if re.search('id=E', url):
++            genre = 'Epilogue Abbey'
++        else:
++            genre = 'Fantasia Gallery'
++        self.story.addToList('genre', genre)
++
++    def getItemFromArchivePage(self):
++
++        urls = ["https://www.dwiggie.com/toc/index.php?id=E&page=all&comp=n",
++                "https://www.dwiggie.com/toc/index.php?id=F&page=all&comp=n"]
++        for url in urls:
++            a = self.tryArchivePage(url)
++            if a is not None:
++                self.getGenre(url)
++                return a.parent
++
++        return None
++
++    def getMetaFromSearch(self):
++
++        params = {}
++        params['title_name'] = self.story.getMetadata('title')
++
++        searchUrl = "https://" + self.getSiteDomain() + "/toc/search.php"
++
++        d = self._postUrl(searchUrl, params)
++# print(d)
++
++        searchsoup = self.make_soup(d)
++        m = re.compile(r"/derby/" + self.story.getMetadata('storyId') +
++                       "[a-z]?.htm$")
++# print(m.pattern)
++# print(self.story.getMetadata('storyId'))
++        a = searchsoup.find('a', href=m)
++
++        return a
++
++    def getChaptersFromPage(self, url):
++        try:
++            data = self.get_request(url)
++        except HTTPError as e:
++            if e.code == 404:
++                return []
++            else:
++                raise e
++
++        s = self.story.getMetadata('storyId').split('/')
++        s.reverse()
++        storyId_trimmed = s[0]
++
++        m = re.match('.*?<body[^>]*>(\s*<br>)?(?P<content>.*?)(</body>|$)',
++                     data, re.DOTALL)
++        newdata = m.group('content')
++        regex = re.compile(r'(Continued\ [Ii]n\ |Continue\ [Oo]n\ [Tt]o\ )?(the\ )?([Nn]ext\ [Ss]ection|[Ss]ection\ [0-9IVXCL]+)')
++        newdata = re.sub(regex, '', newdata)
++
++
++# pagesections = filter(lambda x: x!=None, re.split('(?m)<hr>|<p>\s*<hr>\s*<\/p>', newdata, re.MULTILINE))
++# pagesections = filter(lambda x: x!=None, re.split('(?m)(<p>\s*)*(\s*<\/p>)?', newdata, re.MULTILINE))
++        pagesections = list(filter(lambda x: x != None, re.split('<hr>', newdata)))
++        pagesections = list(filter(lambda x: x.strip() != '/', pagesections))
++# regex = re.compile(r'(href\="'+storyId_trimmed+'[a-z]?.htm$"')
++# pagesections = filter(lambda x: re.search(re.compile(storyId_trimmed + "[a-z]?.htm$"),x)==None, pagesections)
++        pagesections.pop(0)  # always remove header
++
++        regex = re.compile(r'(?m)(href\="' + storyId_trimmed +
++                           '[a-z]?.htm\"|Copyright\ held\ by\ the\ author|<p>\s*(Section\ I|Beginning),\s*)', re.MULTILINE)
++        s = filter(lambda x: regex.search(x), pagesections)
++# print(s)
++        pagesections = list(filter(lambda x: not regex.search(x), pagesections))
++# print(pagesections[0])
++        return pagesections
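++# getChaptersFromPage() returns the list of raw HTML fragments for the
++# individual posts on one derby page; fragments that are only
++# navigation or boilerplate (links back to the story's own .htm files,
++# the copyright notice, 'Section I'/'Beginning' nav lines) are filtered
++# out above instead of being parsed structurally.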
++
++    # Getting the chapter list and the meta data, plus 'is adult' checking.
++    def extractChapterUrlsAndMetadata(self):
++
++        url = self.url
++        meta = self.getItemFromArchivePage()
++# print(meta)
++
++# Title
++        t = meta.a
++        self.story.setMetadata('title', t.string.strip())
++
++# Author
++        author = meta.find('a', 'author_link')
++        if author is not None:
++            self.story.setMetadata('author', author.string.strip())
++            self.story.setMetadata('authorId', author['href'].split('=')[1])
++            self.story.setMetadata('authorUrl', author['href'])
++            author = author.parent
++        else:
++            author = meta.i
++            self.story.setMetadata('author',
++                                   author.string.replace('Written by', '')
++                                   .strip())
++            self.story.setMetadata('authorId', 'unknown')
++            self.story.setMetadata('authorUrl', 'unknown')
++
++
++# DateUpdated
++        dUpdate = meta.find('i', text=re.compile('Last update'))
++        du = dUpdate.replace('Last update', '').replace('.', '').strip()
++        try:
++            self.story.setMetadata('dateUpdated',
++                                   makeDate(du, self.dateformat))
++        except ValueError:
++            self.story.setMetadata('dateUpdated', makeDate(du, "%m/%d/%Y"))
++        compImg = meta.find('img', alt="Dot")
++        if compImg is not None:
++            self.story.setMetadata('status', 'Completed')
++        else:
++            self.story.setMetadata('status', 'In-Progress')
++
++
++# Summary & Category
++# Get the summary components from the meta listing
++        metalist = meta.contents
++        s = []
++        for x in range(0, len(metalist)-1):
++            item = metalist[x]
++            if item == author or item == compImg:
++                s = []
++                continue
++            if item == dUpdate or item == dUpdate.parent:
++                break
++            s.append(item)
++
++# create a soup object from the summary components
++        soup = self.make_soup("<p></p>")
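++# the empty <p> acts as a detached root: the summary fragments
++# collected above are appended under it so they can be flattened
++# with stripHTML() in one pass below.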
    ") ++ d = soup.p ++ for x in s: ++ d.append(x) ++# print(d) ++ ++# extract category from summary text ++ desc = stripHTML(d) ++ books = re.compile(r'(?P\~P&P;?\~|\~Em;?\~|\~MP;?\~|\~S\&S;?\~|\~Per;?\~|\~NA;?\~|\~Juv;?\~|\~Misc;?\~)') ++ booklist = dict({'~P&P~': 'Pride and Prejudice', '~Em~': 'Emma', ++ '~MP~': 'Mansfield Park', '~S&S~': ++ 'Sense and Sensibility', '~Per~': 'Persuasion', ++ '~NA~': 'Northanger Abbey', '~Juv~': 'Juvenilia', ++ '~Misc~': 'Miscellaneous'}) ++ m = re.search(books, desc) ++ print(m.group('book')) ++ book = booklist.get(m.group('book').replace(';', '')) ++ print(book) ++ self.story.addToList('category', book) ++ ++ ++# assign summary info ++ desc = stripHTML(desc).replace(book, '').strip() ++ desc = re.sub('^.\s*', '', desc) ++ if desc is not None: ++ self.setDescription(url, desc) ++ ++# # Chapters (Sections in this case-don't know if we can subdivide them) ++ ++# get the last Section from the archive page link ++# chapters = ["https://www.dwiggie.com"+t['href']] ++ ++# get the section letter from the last page ++ tempUrl = t['href'] ++ if "http://thedwg.com/" in tempUrl: ++ tempUrl = tempUrl.replace("http://thedwg.com/", "/") ++ elif "http://TheDWG.com/" in tempUrl: ++ tempUrl = tempUrl.replace("http://TheDWG.com/", "/") ++ elif "https://thedwg.com/" in tempUrl: ++ tempUrl = tempUrl.replace("https://thedwg.com/", "/") ++ elif "https://TheDWG.com/" in tempUrl: ++ tempUrl = tempUrl.replace("https://TheDWG.com/", "/") ++ m = re.match("/derby/" + self.story.getMetadata('storyId') + ++ "(?P
    [a-z]?).htm$", tempUrl) ++ inc = m.group('section') ++ if inc == '': ++ inc = 'a' ++ ++# get the presumed list of section urls with 'lower' section letters ++ sections = [] ++ baseurl = "https://www.dwiggie.com/derby/"+self.story.getMetadata('storyId') ++ extension = ".htm" ++ ordend = ord(inc) ++ ordbegin = ord('a') ++ for numinc in range(ordbegin, ordend+1): ++ inc = chr(numinc) ++ if inc == 'a': ++ sections.append(baseurl+extension) ++ else: ++ sections.append(baseurl+inc+extension) ++ ++ # Process List of Chapters ++ # create 'dummy' urls for individual chapters in the form ++ # 'pageurl#pageindex' where page index is an index starting with 0 per ++ # page ++ c = 0 ++ postdate = None ++ chapters = [] ++ for x in range(0, len(sections)): ++ section = sections[x] ++ i = 0 ++ for chapter in self.getChaptersFromPage(section): ++ c += 1 ++ chaptersoup = self.make_soup(chapter) ++# self.chapterUrls.append(('Chapter '+str(c),section+'#'+str(i))) ++ cUrl = section+'#'+str(i) ++ t = chaptersoup.find('font', size="+1", color="#336666") ++ ctitle = '' ++ if t is not None: ++ ctitle = stripHTML(t) ++# self.chapterUrls.append(('Chapter '+str(c),cUrl)) ++ self.chapterUrls.append((ctitle, cUrl)) ++ chapters.append((cUrl, chaptersoup)) ++ if postdate is None: ++ regex = re.compile(r'Posted\ on\:?\ (?P\d{4}\-\d{2}\-\d{2}|\w+,\ \d+\ \w+\ \d{4})') ++ # Sunday, 21 March 2004, at 6:00 a.m. ++ m = re.search(regex, chapter) ++ if m is not None: ++ postdate = m.group('date') ++ i += 1 ++ self.chapters = dict(chapters) ++# print(postdate) ++ pubdate = None ++ if postdate is not None: ++ format1 = re.match(re.compile(r'\d{4}\-\d{2}\-\d{2}'), postdate) ++ format2 = re.match(re.compile(r'\w+,\ \d+\ \w+\ \d{4}'), postdate) ++ if format1 is not None: ++ pubdate = makeDate(postdate, "%Y-%m-%d") ++ if format2 is not None: ++ pubdate = makeDate(postdate, "%A, %d %B %Y") ++ ++ if pubdate is None: ++ pubdate = makeDate(self.story.getMetadata('dateUpdated'), ++ "%Y-%m-%d") ++# print(pubdate) ++ self.story.setMetadata('datePublished', pubdate) ++# print(self.story.getMetadata('dateUpdated')) ++# print(self.story.getMetadata('datePublished')) ++ self.story.setMetadata('numChapters', c) ++ logger.debug("numChapters: (%s)" % self.story.getMetadata('numChapters')) ++ ++ # grab the text for an individual chapter. ++ def getChapterText(self, url): ++ logger.debug('Getting chapter text from: %s' % url) ++ ++ chapter = self.chapters.get(url) ++# for c in self.chapters: ++# if c[0] == url: ++# chapter = c[1] ++# chapter = self.make_soup(c[1]) ++ ++# chapter = find(lambda c: c[0] == url, self.chapters)[1] ++# page_url = url.split('#')[0] ++# x = url.split('#')[1] ++# if self.sectionUrl != page_url: ++# self.sectionUrl = page_url ++# self.section = self.getChaptersFromPage(page_url) ++# ++# chapter = self.make_soup(self.section[int(x)]) ++ ++# chapter = self.make_soup(self.getChaptersFromPage(page_url)[int(x)]) ++ ++ return self.utf8FromSoup(url, chapter) diff --git a/python-fanficfare.changes b/python-fanficfare.changes index 4f75b26..85c8a93 100644 --- a/python-fanficfare.changes +++ b/python-fanficfare.changes @@ -1,3 +1,32 @@ +------------------------------------------------------------------- +Wed Feb 15 06:46:36 UTC 2023 - Matej Cepl + +- Update to 4.20.0: + - Fixes for adapter_fictionlive story URLs-normalize & skip + unsub URL + - adapter_deviantartcom date changes #910, thanks bugmaschine + - Revamp retries for browser cache with open_pages_in_browser + - Fix for & in chapter title. 
+  - Add r_anthmax/n_anthmax options for custom_columns_settings
+  - Fixed the Deviantart adapter not detecting that a Deviation
+    is Marked as needing a login, thanks bugmaschine
+  - Skip day of week for localization in browsercache_firefox2
+  - Move makeDate to dateutils to call from browsercache_firefox2
+  - adapter_mediaminerorg: Updates for site changes
+  - adapter_fastnovelsnet: Fixes for site changes -- tested with
+    use_flaresolverr_proxy
+  - Update language->langcode mapping for updated AO3 list
+  - Browser Cache Refactor & open_pages_in_browser feature
+  - See PR #905 and this MR post.
+  - Fixes for config base_xenforo options, closes #902
+  - Fix for adapter_quotevcom status
+  - Equalize ok/cancel buttons on user/pass & email pass dialogs
+  - adapter_ficbooknet: Site change for status + remove debug
+  - Tweak for adapter_storiesonlinenet description parsing
+- Add adapter_dwiggiecom.patch (gh#JimmXinu/FanFicFare#903)
+  for openSUSE-only (reasons are in the patch) support for
+  dwiggie.com.
+
 -------------------------------------------------------------------
 Mon Jan  2 07:40:26 UTC 2023 - Matej Cepl <mcepl@suse.com>
 
diff --git a/python-fanficfare.spec b/python-fanficfare.spec
index 6dcec7f..f342e2f 100644
--- a/python-fanficfare.spec
+++ b/python-fanficfare.spec
@@ -20,13 +20,16 @@
 %define modnamedown fanficfare
 %define skip_python2 1
 Name:           python-fanficfare
-Version:        4.19.0
+Version:        4.20.0
 Release:        0
 Summary:        Tool for making eBooks from stories on fanfiction and other web sites
 License:        GPL-3.0-only
 Group:          Development/Languages/Python
 URL:            https://github.com/JimmXinu/FanFicFare
 Source:         https://github.com/JimmXinu/%{modname}/archive/v%{version}/%{modname}-%{version}.tar.gz
+# PATCH-FEATURE-OPENSUSE adapter_dwiggiecom.patch gh#JimmXinu/FanFicFare#903 mcepl@suse.com
+# adapter for dwiggie.com, which is probably not for upstream
+Patch0:         adapter_dwiggiecom.patch
 BuildRequires:  %{python_module beautifulsoup4}
 BuildRequires:  %{python_module chardet}
 BuildRequires:  %{python_module cloudscraper}