From 45c6d71f57aefc3b63f2a4253eea3f730b76c6fb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mat=C4=9Bj=20Cepl?=
Date: Wed, 15 Feb 2023 07:38:13 +0100
Subject: [PATCH] Add adapter_dwiggiecom, which will never be pushed upstream.

---
 fanficfare/adapters/__init__.py           |   1 +
 fanficfare/adapters/adapter_dwiggiecom.py | 384 ++++++++++++++++++++++++++++++
 2 files changed, 385 insertions(+)
 create mode 100644 fanficfare/adapters/adapter_dwiggiecom.py

Index: FanFicFare-4.20.0/fanficfare/adapters/__init__.py
===================================================================
--- FanFicFare-4.20.0.orig/fanficfare/adapters/__init__.py
+++ FanFicFare-4.20.0/fanficfare/adapters/__init__.py
@@ -160,6 +160,7 @@ from . import adapter_psychficcom
 from . import adapter_deviantartcom
 from . import adapter_merengohu
 from . import adapter_readonlymindcom
+from . import adapter_dwiggiecom
 
 ## This bit of complexity allows adapters to be added by just adding
 ## importing.  It eliminates the long if/else clauses we used to need
Index: FanFicFare-4.20.0/fanficfare/adapters/adapter_dwiggiecom.py
===================================================================
--- /dev/null
+++ FanFicFare-4.20.0/fanficfare/adapters/adapter_dwiggiecom.py
@@ -0,0 +1,384 @@
# -*- coding: utf-8 -*-

# DO NOT PROPOSE TO MERGE! THERE ARE MANY GOOD REASONS WHY DWIGGIE IS
# AMONG
# https://github.com/JimmXinu/FanFicFare/wiki/Supportedsites#sites-not-supported
# See also https://github.com/JimmXinu/FanFicFare/issues/903

# Copyright 2011 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import logging
import re

from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from ..six.moves.urllib.error import HTTPError

from .base_adapter import BaseSiteAdapter, makeDate

logger = logging.getLogger(__name__)


def getClass():
    return DwiggieComAdapter

# Class name has to be unique.  Our convention is camel case the
# sitename with Adapter at the end.  www is skipped.


class DwiggieComAdapter(BaseSiteAdapter):

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)

        # 1252 is a superset of iso-8859-1.  Most sites that claim to be
        # iso-8859-1 (and some that claim to be utf8) are really
        # windows-1252.
        self.decode = ["Windows-1252", "utf8"]

        # if left empty, the site doesn't return any message at all.
        self.username = "NoneGiven"
        self.password = ""
        self.is_adult = False
        self.sectionUrl = ""
        self.section = []
        self.chapters = dict()

        # get storyId from url--url validation guarantees the path matches
        m = re.match(self.getSiteURLPattern(), url)
        if m:
            self.story.setMetadata('storyId', m.group('id'))
            logger.debug("storyId: (%s)" % self.story.getMetadata('storyId'))
            # normalized story URL.
            self._setURL('https://www.' + self.getSiteDomain()
                         + '/derby/' + self.story.getMetadata('storyId')
                         + '.htm')
        else:
            raise exceptions.InvalidStoryURL(url,
                                             self.getSiteDomain(),
                                             self.getSiteExampleURLs())

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev', 'dwg')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%m/%d/%y"

    @staticmethod  # must be @staticmethod, don't remove it.
    def getSiteDomain():
        # The site domain.  Does have www here, if it uses it.
        return 'dwiggie.com'

    @classmethod
    def getAcceptDomains(cls):
        return ['www.dwiggie.com', 'dwiggie.com', 'thedwg.com', 'TheDWG.com']

    def getSiteExampleURLs(self):
        return "https://" + self.getSiteDomain() + "/derby/name1b.htm"

    def getSiteURLPattern(self):
        # e.g. https://www.dwiggie.com/derby/mari17b.htm
        return (r"https?://(www\.)?(thedwg|TheDWG|dwiggie)\.com/derby/"
                r"(?P<id>(old_\d{4}\/|old[a-z]\/)?[a-z]+\d+)"
                r"(?P<section>[a-z]*)\.htm$")
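    # Editor's illustration (not used by the adapter itself): how the
    # pattern above carves up a story URL.  Both ids are made-up
    # examples, not necessarily real stories.
    #
    #   https://www.dwiggie.com/derby/mari17b.htm
    #       -> id = 'mari17', section = 'b'  (second section page)
    #   https://dwiggie.com/derby/old_2003/amyi1.htm
    #       -> id = 'old_2003/amyi1', section = ''  (first/only page)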
    def tryArchivePage(self, url):
        try:
            data = self.get_request(url)
        except HTTPError as e:
            if e.code == 404:
                # need to change the exception returned
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise e

        archivesoup = self.make_soup(data)
        m = re.compile(r"/derby/" + self.story.getMetadata('storyId')
                       + r"[a-z]?\.htm$")
        a = archivesoup.find('a', href=m)

        return a

    def getGenre(self, url):
        if re.search('id=E', url):
            genre = 'Epilogue Abbey'
        else:
            genre = 'Fantasia Gallery'
        self.story.addToList('genre', genre)

    def getItemFromArchivePage(self):

        urls = ["https://www.dwiggie.com/toc/index.php?id=E&page=all&comp=n",
                "https://www.dwiggie.com/toc/index.php?id=F&page=all&comp=n"]
        for url in urls:
            a = self.tryArchivePage(url)
            if a is not None:
                self.getGenre(url)
                return a.parent
        # the story is listed in neither archive
        return None

    def getMetaFromSearch(self):

        params = {}
        params['title_name'] = self.story.getMetadata('title')

        searchUrl = "https://" + self.getSiteDomain() + "/toc/search.php"

        d = self._postUrl(searchUrl, params)

        searchsoup = self.make_soup(d)
        m = re.compile(r"/derby/" + self.story.getMetadata('storyId')
                       + r"[a-z]?\.htm$")
        a = searchsoup.find('a', href=m)

        return a

    def getChaptersFromPage(self, url):
        try:
            data = self.get_request(url)
        except HTTPError as e:
            if e.code == 404:
                return []
            else:
                raise e

        s = self.story.getMetadata('storyId').split('/')
        s.reverse()
        storyId_trimmed = s[0]

        # keep everything inside <body>, dropping a leading <hr> if present
        m = re.match(r'.*?<body[^>]*>(\s*<hr[^>]*>)?'
                     r'(?P<content>.*?)(</body>|$)',
                     data, re.DOTALL)
        newdata = m.group('content')
        regex = re.compile(r'(Continued\ [Ii]n\ |Continue\ [Oo]n\ [Tt]o\ )?'
                           r'(the\ )?([Nn]ext\ [Ss]ection'
                           r'|[Ss]ection\ [0-9IVXCL]+)')
        newdata = re.sub(regex, '', newdata)

        # posts on a page are separated by horizontal rules; use lists,
        # not lazy filter() objects, so .pop() below works on Python 3
        pagesections = [x for x in re.split(r'<hr[^>]*>', newdata)
                        if x is not None and x.strip() != '/']
        pagesections.pop(0)  # always remove the header

        # drop the sections which are only navigation links, the
        # copyright notice or "Section I"/"Beginning" pointers
        regex = re.compile(r'(?m)(href\="' + storyId_trimmed
                           + r'[a-z]?\.htm\"'
                           r'|Copyright\ held\ by\ the\ author'
                           r'|<p>\s*(Section\ I|Beginning),\s*)',
                           re.MULTILINE)
        pagesections = [x for x in pagesections if not regex.search(x)]
        return pagesections
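    # Editor's note on the splitting above (a sketch of the assumed page
    # shape, not verified against every archive layout): a derby page is
    # roughly
    #
    #   <body> banner/header <hr> post 1 <hr> post 2 <hr> nav links </body>
    #
    # so splitting on <hr> and discarding the header and link-only
    # pieces leaves one list entry per posted chapter.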

    # Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):

        url = self.url
        meta = self.getItemFromArchivePage()

        # Title
        t = meta.a
        self.story.setMetadata('title', t.string.strip())

        # Author
        author = meta.find('a', 'author_link')
        if author is not None:
            self.story.setMetadata('author', author.string.strip())
            self.story.setMetadata('authorId', author['href'].split('=')[1])
            self.story.setMetadata('authorUrl', author['href'])
            author = author.parent
        else:
            author = meta.i
            self.story.setMetadata('author',
                                   author.string.replace('Written by', '')
                                   .strip())
            self.story.setMetadata('authorId', 'unknown')
            self.story.setMetadata('authorUrl', 'unknown')

        # DateUpdated
        dUpdate = meta.find('i', text=re.compile('Last update'))
        du = dUpdate.replace('Last update', '').replace('.', '').strip()
        try:
            self.story.setMetadata('dateUpdated',
                                   makeDate(du, self.dateformat))
        except ValueError:
            self.story.setMetadata('dateUpdated', makeDate(du, "%m/%d/%Y"))

        # a "Dot" image marks completed stories in the archive listing
        compImg = meta.find('img', alt="Dot")
        if compImg is not None:
            self.story.setMetadata('status', 'Completed')
        else:
            self.story.setMetadata('status', 'In-Progress')

        # Summary & Category
        # Get the summary components from the meta listing
        metalist = meta.contents
        s = []
        for x in range(0, len(metalist) - 1):
            item = metalist[x]
            if item == author or item == compImg:
                s = []
                continue
            if item == dUpdate or item == dUpdate.parent:
                break
            s.append(item)

        # create a soup object from the summary components
        soup = self.make_soup("<p></p>")
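        # Editor's illustration (a made-up archive entry, for
        # orientation): the components collected in s amount to
        # something like
        #   ~P&P~ Darcy broods at Pemberley.
        # where the ~...~ marker drives the category lookup below and
        # the remainder becomes the story summary.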

    ") + d = soup.p + for x in s: + d.append(x) +# print(d) + +# extract category from summary text + desc = stripHTML(d) + books = re.compile(r'(?P\~P&P;?\~|\~Em;?\~|\~MP;?\~|\~S\&S;?\~|\~Per;?\~|\~NA;?\~|\~Juv;?\~|\~Misc;?\~)') + booklist = dict({'~P&P~': 'Pride and Prejudice', '~Em~': 'Emma', + '~MP~': 'Mansfield Park', '~S&S~': + 'Sense and Sensibility', '~Per~': 'Persuasion', + '~NA~': 'Northanger Abbey', '~Juv~': 'Juvenilia', + '~Misc~': 'Miscellaneous'}) + m = re.search(books, desc) + print(m.group('book')) + book = booklist.get(m.group('book').replace(';', '')) + print(book) + self.story.addToList('category', book) + + +# assign summary info + desc = stripHTML(desc).replace(book, '').strip() + desc = re.sub('^.\s*', '', desc) + if desc is not None: + self.setDescription(url, desc) + +# # Chapters (Sections in this case-don't know if we can subdivide them) + +# get the last Section from the archive page link +# chapters = ["https://www.dwiggie.com"+t['href']] + +# get the section letter from the last page + tempUrl = t['href'] + if "http://thedwg.com/" in tempUrl: + tempUrl = tempUrl.replace("http://thedwg.com/", "/") + elif "http://TheDWG.com/" in tempUrl: + tempUrl = tempUrl.replace("http://TheDWG.com/", "/") + elif "https://thedwg.com/" in tempUrl: + tempUrl = tempUrl.replace("https://thedwg.com/", "/") + elif "https://TheDWG.com/" in tempUrl: + tempUrl = tempUrl.replace("https://TheDWG.com/", "/") + m = re.match("/derby/" + self.story.getMetadata('storyId') + + "(?P
        inc = m.group('section')
        if inc == '':
            inc = 'a'

        # build the presumed list of section urls with 'lower' section letters
        sections = []
        baseurl = ("https://www.dwiggie.com/derby/"
                   + self.story.getMetadata('storyId'))
        extension = ".htm"
        ordend = ord(inc)
        ordbegin = ord('a')
        for numinc in range(ordbegin, ordend + 1):
            inc = chr(numinc)
            if inc == 'a':
                sections.append(baseurl + extension)
            else:
                sections.append(baseurl + inc + extension)

        # Process List of Chapters
        # create 'dummy' urls for individual chapters in the form
        # 'pageurl#pageindex' where pageindex is an index starting
        # with 0 per page
        c = 0
        postdate = None
        chapters = []
        for x in range(0, len(sections)):
            section = sections[x]
            i = 0
            for chapter in self.getChaptersFromPage(section):
                c += 1
                chaptersoup = self.make_soup(chapter)
                cUrl = section + '#' + str(i)
                t = chaptersoup.find('font', size="+1", color="#336666")
                ctitle = ''
                if t is not None:
                    ctitle = stripHTML(t)
                self.chapterUrls.append((ctitle, cUrl))
                chapters.append((cUrl, chaptersoup))
                if postdate is None:
                    # e.g. "2004-03-21" or
                    # "Sunday, 21 March 2004, at 6:00 a.m."
                    regex = re.compile(r'Posted\ on\:?\ (?P<date>\d{4}\-\d{2}\-\d{2}|\w+,\ \d+\ \w+\ \d{4})')
                    m = re.search(regex, chapter)
                    if m is not None:
                        postdate = m.group('date')
                i += 1
        self.chapters = dict(chapters)

        pubdate = None
        if postdate is not None:
            format1 = re.match(re.compile(r'\d{4}\-\d{2}\-\d{2}'), postdate)
            format2 = re.match(re.compile(r'\w+,\ \d+\ \w+\ \d{4}'), postdate)
            if format1 is not None:
                pubdate = makeDate(postdate, "%Y-%m-%d")
            if format2 is not None:
                pubdate = makeDate(postdate, "%A, %d %B %Y")

        if pubdate is None:
            # fall back to the already parsed update date
            pubdate = makeDate(self.story.getMetadata('dateUpdated'),
                               "%Y-%m-%d")
        self.story.setMetadata('datePublished', pubdate)
        self.story.setMetadata('numChapters', c)
        logger.debug("numChapters: (%s)"
                     % self.story.getMetadata('numChapters'))

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        logger.debug('Getting chapter text from: %s' % url)

        # the chapter soups were already built and cached, keyed by the
        # dummy 'pageurl#pageindex' urls, in extractChapterUrlsAndMetadata()
        chapter = self.chapters.get(url)

        return self.utf8FromSoup(url, chapter)
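# Rough usage sketch (editor's note, assumptions flagged): with this
# file dropped into fanficfare/adapters/ and the import added to
# __init__.py as in the hunk above, the stock command-line client
# should pick the adapter up by URL, e.g.
#
#   fanficfare https://www.dwiggie.com/derby/mari17b.htm
#
# ('mari17b' is an illustrative story id, not necessarily a real one.)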