python-fanficfare/adapter_dwiggiecom.patch

From 45c6d71f57aefc3b63f2a4253eea3f730b76c6fb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mat=C4=9Bj=20Cepl?= <mcepl@cepl.eu>
Date: Wed, 15 Feb 2023 07:38:13 +0100
Subject: [PATCH] Add adapter_dwiggiecom, which will never be pushed upstream.
---
fanficfare/adapters/__init__.py | 1
fanficfare/adapters/adapter_dwiggiecom.py | 384 ++++++++++++++++++++++++++++++
2 files changed, 385 insertions(+)
create mode 100644 fanficfare/adapters/adapter_dwiggiecom.py
Index: FanFicFare-4.20.0/fanficfare/adapters/__init__.py
===================================================================
--- FanFicFare-4.20.0.orig/fanficfare/adapters/__init__.py
+++ FanFicFare-4.20.0/fanficfare/adapters/__init__.py
@@ -160,6 +160,7 @@ from . import adapter_psychficcom
from . import adapter_deviantartcom
from . import adapter_merengohu
from . import adapter_readonlymindcom
+from . import adapter_dwiggiecom
## This bit of complexity allows adapters to be added by just adding
## importing. It eliminates the long if/else clauses we used to need
Index: FanFicFare-4.20.0/fanficfare/adapters/adapter_dwiggiecom.py
===================================================================
--- /dev/null
+++ FanFicFare-4.20.0/fanficfare/adapters/adapter_dwiggiecom.py
@@ -0,0 +1,384 @@
+# -*- coding: utf-8 -*-
+
+# DO NOT PROPOSE TO MERGE! THERE ARE MANY GOOD REASONS WHY DWIGGIE IS
+# AMONG
+# https://github.com/JimmXinu/FanFicFare/wiki/Supportedsites#sites-not-supported
+# See also https://github.com/JimmXinu/FanFicFare/issues/903
+
+# Copyright 2011 Fanficdownloader team
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import logging
+import re
+
+from ..htmlcleanup import stripHTML
+from .. import exceptions as exceptions
+from ..six.moves.urllib.error import HTTPError
+
+from .base_adapter import BaseSiteAdapter, makeDate
+
+logger = logging.getLogger(__name__)
+
+
+def getClass():
+ return DwiggieComAdapter
+
+# Class name has to be unique. Our convention is camel case the
+# sitename with Adapter at the end. www is skipped.
+
+
+class DwiggieComAdapter(BaseSiteAdapter):
+
+ def __init__(self, config, url):
+ BaseSiteAdapter.__init__(self, config, url)
+
+# 1252 is a superset of iso-8859-1. Most sites that claim to be
+# iso-8859-1 (and some that claim to be utf8) are really windows-1252.
+ self.decode = ["Windows-1252", "utf8"]
+
+# if left empty, site doesn't return any message at all.
+ self.username = "NoneGiven"
+ self.password = ""
+ self.is_adult = False
+ self.sectionUrl = ""
+ self.section = []
+ self.chapters = dict()
+
+
+# # get storyId from url--url validation guarantees query is only
+# # sid=1234
+# self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])
+# logger.debug("storyId: (%s)"%self.story.getMetadata('storyId'))
+
+# get storyId from url--url validation guarantees query correct
+        m = re.match(self.getSiteURLPattern(), url)
+        if m:
+            self.story.setMetadata('storyId', m.group('id'))
+            logger.debug("storyId: (%s)" % self.story.getMetadata('storyId'))
+            # normalized story URL.
+            self._setURL('https://www.' + self.getSiteDomain() +
+                         '/derby/' + self.story.getMetadata('storyId') + '.htm')
+        else:
+            raise exceptions.InvalidStoryURL(url,
+                                             self.getSiteDomain(),
+                                             self.getSiteExampleURLs())
+
+# Each adapter needs to have a unique site abbreviation.
+ self.story.setMetadata('siteabbrev', 'dwg')
+
+# The date format will vary from site to site.
+# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
+ self.dateformat = "%m/%d/%y"
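+        # e.g. a listing date written as '03/21/04' parses with this format;
+        # extractChapterUrlsAndMetadata() falls back to "%m/%d/%Y" when the
+        # year has four digits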
+
+ @staticmethod # must be @staticmethod, don't remove it.
+ def getSiteDomain():
+ # The site domain. Does have www here, if it uses it.
+ return 'dwiggie.com'
+
+ @classmethod
+ def getAcceptDomains(cls):
+ return ['www.dwiggie.com', 'dwiggie.com', 'thedwg.com', 'TheDWG.com']
+
+ def getSiteExampleURLs(self):
+ return "https://"+self.getSiteDomain()+"/derby/name1b.htm"
+
+    def getSiteURLPattern(self):
+        # https://www.dwiggie.com/derby/mari17b.htm
+        return r"https?://(www\.)?(thedwg|TheDWG|dwiggie)\.com/derby/(?P<id>(old_\d{4}/|old[a-z]/)?[a-z]+\d+)(?P<part>[a-z]*)\.htm$"
+
+    def tryArchivePage(self, url):
+        try:
+            data = self.get_request(url)
+
+        except HTTPError as e:
+            if e.code == 404:
+                # need to change the exception returned
+                raise exceptions.StoryDoesNotExist(self.url)
+            else:
+                raise e
+
+        archivesoup = self.make_soup(data)
+        m = re.compile(r"/derby/" +
+                       self.story.getMetadata('storyId') + r"[a-z]?\.htm$")
+#        print(m.pattern)
+#        print(archivesoup)
+        a = archivesoup.find('a', href=m)
+
+        return a
+
+ def getGenre(self, url):
+ if re.search('id=E', url):
+ genre = 'Epilogue Abbey'
+ else:
+ genre = 'Fantasia Gallery'
+ self.story.addToList('genre', genre)
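+        # e.g. the Epilogue Abbey listing URL used below contains 'id=E' and
+        # therefore maps to the 'Epilogue Abbey' genre; any other archive URL
+        # falls back to 'Fantasia Gallery'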
+
+    def getItemFromArchivePage(self):
+
+        urls = ["https://www.dwiggie.com/toc/index.php?id=E&page=all&comp=n",
+                "https://www.dwiggie.com/toc/index.php?id=F&page=all&comp=n"]
+        for url in urls:
+            a = self.tryArchivePage(url)
+            if a is not None:
+                self.getGenre(url)
+                return a.parent
+        # the story was found on neither archive listing
+        return None
+
+ def getMetaFromSearch(self):
+
+ params = {}
+ params['title_name'] = self.story.getMetadata('title')
+
+ searchUrl = "https://" + self.getSiteDomain() + "/toc/search.php"
+
+ d = self._postUrl(searchUrl, params)
+# print(d)
+
+ searchsoup = self.make_soup(d)
+ m = re.compile(r"/derby/" + self.story.getMetadata('storyId') +
+ "[a-z]?.htm$")
+# print(m.pattern)
+# print(self.story.getMetadata('storyId'))
+ a = searchsoup.find('a', href=m)
+
+ return a
+
+    def getChaptersFromPage(self, url):
+        try:
+            data = self.get_request(url)
+        except HTTPError as e:
+            if e.code == 404:
+                return []
+            else:
+                raise e
+
+        s = self.story.getMetadata('storyId').split('/')
+        s.reverse()
+        storyId_trimmed = s[0]
+
+        m = re.match(r'.*?<body[^>]*>(\s*<ul>)?(?P<content>.*?)(</body>|$)',
+                     data, re.DOTALL)
+        # fall back to the whole page if no <body> tag was found
+        newdata = m.group('content') if m else data
+        regex = re.compile(r'<a href="' + storyId_trimmed +
+                           r'[a-z]?\.htm">(Continued [Ii]n |Continue [Oo]n [Tt]o )?(the )?([Nn]ext [Ss]ection|[Ss]ection [0-9IVXCL]+)</a>')
+        newdata = re.sub(regex, '', newdata)
+
+#        pagesections = filter(lambda x: x!=None, re.split('(?m)<hr( \/)?>|<p>\s*<hr( \/)?>\s*<\/p>', newdata, re.MULTILINE))
+#        pagesections = filter(lambda x: x!=None, re.split('(?m)(<p>\s*)*<hr( \/)?>(\s*<\/p>)?', newdata, re.MULTILINE))
+        # materialize as a list -- in Python 3 filter() returns an iterator,
+        # which would break pop(0) and the later passes over the sections
+        pagesections = [x for x in re.split(r'<hr( /)?>', newdata)
+                        if x is not None and x.strip() != '/']
+#        regex = re.compile(r'(href\="'+storyId_trimmed+'[a-z]?.htm$"')
+#        pagesections = filter(lambda x: re.search(re.compile(storyId_trimmed + "[a-z]?.htm$"),x)==None, pagesections)
+        pagesections.pop(0)  # always remove header
+
+        regex = re.compile(r'(href="' + storyId_trimmed +
+                           r'[a-z]?\.htm"|Copyright held by the author|<p>\s*(Section I|Beginning),\s*</?p>)',
+                           re.MULTILINE)
+        s = [x for x in pagesections if regex.search(x)]
+#        print(s)
+        pagesections = [x for x in pagesections if not regex.search(x)]
+#        print(pagesections[0])
+        return pagesections
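+    # Each element returned by getChaptersFromPage() is one <hr>-delimited
+    # HTML fragment. A hypothetical fragment, showing only the markup the
+    # chapter loop below actually looks for, might be:
+    #   '<p><font size="+1" color="#336666">Chapter One</font></p>
+    #    <p>Posted on 2004-03-21</p> ...'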
+
+ # Getting the chapter list and the meta data, plus 'is adult' checking.
+ def extractChapterUrlsAndMetadata(self):
+
+        url = self.url
+        meta = self.getItemFromArchivePage()
+        if meta is None:
+            # story link not found on either archive listing page
+            raise exceptions.StoryDoesNotExist(self.url)
+#        print(meta)
+
+# Title
+ t = meta.a
+ self.story.setMetadata('title', t.string.strip())
+
+# Author
+ author = meta.find('a', 'author_link')
+ if author is not None:
+ self.story.setMetadata('author', author.string.strip())
+ self.story.setMetadata('authorId', author['href'].split('=')[1])
+ self.story.setMetadata('authorUrl', author['href'])
+ author = author.parent
+ else:
+ author = meta.i
+ self.story.setMetadata('author',
+ author.string.replace('Written by', '')
+ .strip())
+ self.story.setMetadata('authorId', 'unknown')
+ self.story.setMetadata('authorUrl', 'unknown')
+
+
+# DateUpdated
+ dUpdate = meta.find('i', text=re.compile('Last update'))
+ du = dUpdate.replace('Last update', '').replace('.', '').strip()
+ try:
+ self.story.setMetadata('dateUpdated',
+ makeDate(du, self.dateformat))
+ except ValueError:
+ self.story.setMetadata('dateUpdated', makeDate(du, "%m/%d/%Y"))
+ compImg = meta.find('img', alt="Dot")
+ if compImg is not None:
+ self.story.setMetadata('status', 'Completed')
+ else:
+ self.story.setMetadata('status', 'In-Progress')
+
+
+# Summary & Category
+# Get the summary components from the meta listing
+ metalist = meta.contents
+ s = []
+ for x in range(0, len(metalist)-1):
+ item = metalist[x]
+ if item == author or item == compImg:
+ s = []
+ continue
+ if item == dUpdate or item == dUpdate.parent:
+ break
+ s.append(item)
+
+# create a soup object from the summary components
+ soup = self.make_soup("<p></p>")
+ d = soup.p
+ for x in s:
+ d.append(x)
+# print(d)
+
+# extract category from summary text
+        desc = stripHTML(d)
+        books = re.compile(r'(?P<book>~P&P;?~|~Em;?~|~MP;?~|~S&S;?~|~Per;?~|~NA;?~|~Juv;?~|~Misc;?~)')
+        booklist = {'~P&P~': 'Pride and Prejudice', '~Em~': 'Emma',
+                    '~MP~': 'Mansfield Park',
+                    '~S&S~': 'Sense and Sensibility', '~Per~': 'Persuasion',
+                    '~NA~': 'Northanger Abbey', '~Juv~': 'Juvenilia',
+                    '~Misc~': 'Miscellaneous'}
+        m = re.search(books, desc)
+        book = ''
+        if m is not None:
+            book = booklist.get(m.group('book').replace(';', ''), '')
+            logger.debug("category: (%s)" % book)
+            self.story.addToList('category', book)
+
+
+# assign summary info
+        desc = stripHTML(desc).replace(book, '').strip()
+        desc = re.sub(r'^.\s*', '', desc)
+        if desc is not None:
+            self.setDescription(url, desc)
+
+# # Chapters (Sections in this case-don't know if we can subdivide them)
+
+# get the last Section from the archive page link
+# chapters = ["https://www.dwiggie.com"+t['href']]
+
+# get the section letter from the last page
+        tempUrl = t['href']
+        # strip an absolute thedwg.com / TheDWG.com prefix so that only the
+        # site-relative path remains
+        tempUrl = re.sub(r'^https?://(thedwg|TheDWG)\.com/', '/', tempUrl)
+        m = re.match(r"/derby/" + self.story.getMetadata('storyId') +
+                     r"(?P<section>[a-z]?)\.htm$", tempUrl)
+        inc = m.group('section')
+        if inc == '':
+            inc = 'a'
+
+# get the presumed list of section urls with 'lower' section letters
+ sections = []
+ baseurl = "https://www.dwiggie.com/derby/"+self.story.getMetadata('storyId')
+ extension = ".htm"
+ ordend = ord(inc)
+ ordbegin = ord('a')
+ for numinc in range(ordbegin, ordend+1):
+ inc = chr(numinc)
+ if inc == 'a':
+ sections.append(baseurl+extension)
+ else:
+ sections.append(baseurl+inc+extension)
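+        # Illustrative result for a hypothetical storyId 'mari17' whose last
+        # section letter is 'c':
+        #   ['https://www.dwiggie.com/derby/mari17.htm',
+        #    'https://www.dwiggie.com/derby/mari17b.htm',
+        #    'https://www.dwiggie.com/derby/mari17c.htm']
+        # i.e. the first section has no letter suffix, later sections do.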
+
+ # Process List of Chapters
+ # create 'dummy' urls for individual chapters in the form
+ # 'pageurl#pageindex' where page index is an index starting with 0 per
+ # page
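+        # e.g. (hypothetical) 'https://www.dwiggie.com/derby/mari17b.htm#0' is
+        # the dummy url of the first chapter fragment found on that section page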
+ c = 0
+ postdate = None
+ chapters = []
+ for x in range(0, len(sections)):
+ section = sections[x]
+ i = 0
+ for chapter in self.getChaptersFromPage(section):
+ c += 1
+ chaptersoup = self.make_soup(chapter)
+# self.chapterUrls.append(('Chapter '+str(c),section+'#'+str(i)))
+ cUrl = section+'#'+str(i)
+ t = chaptersoup.find('font', size="+1", color="#336666")
+ ctitle = ''
+ if t is not None:
+ ctitle = stripHTML(t)
+# self.chapterUrls.append(('Chapter '+str(c),cUrl))
+ self.chapterUrls.append((ctitle, cUrl))
+ chapters.append((cUrl, chaptersoup))
+ if postdate is None:
+ regex = re.compile(r'Posted\ on\:?\ (?P<date>\d{4}\-\d{2}\-\d{2}|\w+,\ \d+\ \w+\ \d{4})')
+ # Sunday, 21 March 2004, at 6:00 a.m.
+ m = re.search(regex, chapter)
+ if m is not None:
+ postdate = m.group('date')
+ i += 1
+ self.chapters = dict(chapters)
+# print(postdate)
+ pubdate = None
+ if postdate is not None:
+ format1 = re.match(re.compile(r'\d{4}\-\d{2}\-\d{2}'), postdate)
+ format2 = re.match(re.compile(r'\w+,\ \d+\ \w+\ \d{4}'), postdate)
+ if format1 is not None:
+ pubdate = makeDate(postdate, "%Y-%m-%d")
+ if format2 is not None:
+ pubdate = makeDate(postdate, "%A, %d %B %Y")
+
+ if pubdate is None:
+ pubdate = makeDate(self.story.getMetadata('dateUpdated'),
+ "%Y-%m-%d")
+# print(pubdate)
+ self.story.setMetadata('datePublished', pubdate)
+# print(self.story.getMetadata('dateUpdated'))
+# print(self.story.getMetadata('datePublished'))
+ self.story.setMetadata('numChapters', c)
+ logger.debug("numChapters: (%s)" % self.story.getMetadata('numChapters'))
+
+ # grab the text for an individual chapter.
+ def getChapterText(self, url):
+ logger.debug('Getting chapter text from: %s' % url)
+
+ chapter = self.chapters.get(url)
+# for c in self.chapters:
+# if c[0] == url:
+# chapter = c[1]
+# chapter = self.make_soup(c[1])
+
+# chapter = find(lambda c: c[0] == url, self.chapters)[1]
+# page_url = url.split('#')[0]
+# x = url.split('#')[1]
+# if self.sectionUrl != page_url:
+# self.sectionUrl = page_url
+# self.section = self.getChaptersFromPage(page_url)
+#
+# chapter = self.make_soup(self.section[int(x)])
+
+# chapter = self.make_soup(self.getChaptersFromPage(page_url)[int(x)])
+
+ return self.utf8FromSoup(url, chapter)