2008-11-22 Christopher Blizzard <blizzard@0xdeadbeef.com>
author: blizzard <blizzard@ae879524-a8bd-4c4c-a5ea-74d2e5fc5a2c>
Sun, 23 Nov 2008 04:41:42 +0000 (04:41 +0000)
committer: blizzard <blizzard@ae879524-a8bd-4c4c-a5ea-74d2e5fc5a2c>
Sun, 23 Nov 2008 04:41:42 +0000 (04:41 +0000)
        * whoisi/test_controller.py (TestController.modified): Now returns
        a valid RSS feed + data type for tests.

        * whoisi/model.py (Site): Add etag, last_modified and entity_url
        entries to the site table.

        * tests/twisted/network/test_download.py: Lots of changes here to
        support the new download command format (using hash instead of
        direct call.)

        * tests/twisted/local/test_feedparse_perf.py: Same.

        * tests/twisted/local/test_feedparse.py: Same.

        * tests/twisted/network/test_feedrefresh.py
        (TestFeedRefresh.test_RefreshSiteManagerEntityProperties): This
        test makes sure that we set entity properties in the site table
        after we hit a site that includes them.
        (TestFeedRefresh.test_RefreshSiteManagerEntityHit): This test
        makes sure that we return early and don't parse when we send a
        matching etag or last-modified along with a request.

        * services/command/siterefresh.py (RefreshSiteDone.srDone): Save
        etag, last-modified and entity_url info in the site if we have it.
        (RefreshSiteDone.done): When returning the data to the master
        process add a http_entity_hit=0 in the dict so we know we did a
        download. (For future use.)
        (RefreshSiteError.handleError): Handle the DownloadCommand
        throwing a NotModifiedError which means that we don't have to do
        any parsing or updating of information.  Short cut to exit.
        Return value will include a http_entity_hit=1 for future use.  We
        also set the error field to http_not_modified when we hit this
        condition.  Also update the error field in the SiteRefresh table
        when there's a real error.

        * services/command/controller.py (RefreshManager.__init__): Use
        new DownloadResourceSaveState after a download as part of a
        refresh.

        * services/command/newsite.py (NewSiteTryURL.doCommand): When
        calling the download command pass in the url as part of a
        dictionary.
        (NewSiteTryURL.downloadDone): More args["filename"] changes.
        (NewSiteTryURL.startSecondDownload): Same.
        (NewSiteTryURL.secondDownloadDone): Same.
        (NewSiteTryURL.tryFeed): Same.

        * services/command/download.py (DownloadResourceSaveState): Shim
        command that takes the download data and saves it into the state
        for later commands.
        (DownloadCommand.doCommand): New code to handle etag,
        last_modified and entity_url info as arguments to this command.
        (DownloadCommand.downloadDone): Data is now returned as a hash
        that includes filename, etag, last_modified and the url stack of
        downloads.

        * services/command/feedparse.py (FeedRefreshSetup.gotNewSite):
        Gets the etag, last_modified and entity_url out of the database
        when setting up for a feed refresh.
        (FeedRefreshSetup.gotFeed): When returning with a setup refresh
        the next command is the download so set up everything the download
        needs to send an etag + last-modified header if we can.
        (FeedParseCommand.doCommand): Convert to use args["filename"]
        instead of just filename since the downloadcommand now returns
        more than just the filename.

        * services/command/linkedin.py (LinkedInScrapeCommand.doCommand):
        Convert linkedin code to use a hash["filename"] instead of just
        the filename.

git-svn-id: svn://trac.whoisi.com/whoisi/trunk@13 ae879524-a8bd-4c4c-a5ea-74d2e5fc5a2c

13 files changed:
ChangeLog
services/command/controller.py
services/command/download.py
services/command/feedparse.py
services/command/linkedin.py
services/command/newsite.py
services/command/siterefresh.py
tests/twisted/local/test_feedparse.py
tests/twisted/local/test_feedparse_perf.py
tests/twisted/network/test_download.py
tests/twisted/network/test_feedrefresh.py
whoisi/model.py
whoisi/test_controller.py

index a37117e..5a73b4a 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,75 @@
+2008-11-22  Christopher Blizzard  <blizzard@0xdeadbeef.com>
+
+       * whoisi/test_controller.py (TestController.modified): Now returns
+       a valid RSS feed + data type for tests.
+
+       * whoisi/model.py (Site): Add etag, last_modified and entity_url
+       entries to the site table.
+
+       * tests/twisted/network/test_download.py: Lots of changes here to
+       support the new download command format (using hash instead of
+       direct call.)
+
+       * tests/twisted/local/test_feedparse_perf.py: Same.
+
+       * tests/twisted/local/test_feedparse.py: Same.
+
+       * tests/twisted/network/test_feedrefresh.py
+       (TestFeedRefresh.test_RefreshSiteManagerEntityProperties): This
+       test makes sure that we set entity properties in the site table
+       after we hit a site that includes them.
+       (TestFeedRefresh.test_RefreshSiteManagerEntityHit): This test
+       makes sure that we return early and don't parse when we send a
+       matching etag or last-modified along with a request.
+
+       * services/command/siterefresh.py (RefreshSiteDone.srDone): Save
+       etag, last-modified and entity_url info in the site if we have it.
+       (RefreshSiteDone.done): When returning the data to the master
+       process add a http_entity_hit=0 in the dict so we know we did a
+       download. (For future use.)
+       (RefreshSiteError.handleError): Handle the DownloadCommand
+       throwing a NotModifiedError which means that we don't have to do
+       any parsing or updating of information.  Short cut to exit.
+       Return value will include a http_entity_hit=1 for future use.  We
+       also set the error field to http_not_modified when we hit this
+       condition.  Also update the error field in the SiteRefresh table
+       when there's a real error.
+
+       * services/command/controller.py (RefreshManager.__init__): Use
+       new DownloadResourceSaveState after a download as part of a
+       refresh.
+
+       * services/command/newsite.py (NewSiteTryURL.doCommand): When
+       calling the download command pass in the url as part of a
+       dictionary.
+       (NewSiteTryURL.downloadDone): More args["filename"] changes.
+       (NewSiteTryURL.startSecondDownload): Same.
+       (NewSiteTryURL.secondDownloadDone): Same.
+       (NewSiteTryURL.tryFeed): Same.
+
+       * services/command/download.py (DownloadResourceSaveState): Shim
+       command that takes the download data and saves it into the state
+       for later commands.
+       (DownloadCommand.doCommand): New code to handle etag,
+       last_modified and entity_url info as arguments to this command.
+       (DownloadCommand.downloadDone): Data is now returned as a hash
+       that includes filename, etag, last_modified and the url stack of
+       downloads.
+
+       * services/command/feedparse.py (FeedRefreshSetup.gotNewSite):
+       Gets the etag, last_modified and entity_url out of the database
+       when setting up for a feed refresh.
+       (FeedRefreshSetup.gotFeed): When returning with a setup refresh
+       the next command is the download so set up everything the download
+       needs to send an etag + last-modified header if we can.
+       (FeedParseCommand.doCommand): Convert to use args["filename"]
+       instead of just filename since the downloadcommand now returns
+       more than just the filename.
+
+       * services/command/linkedin.py (LinkedInScrapeCommand.doCommand):
+       Convert linkedin code to use a hash["filename"] instead of just
+       the filename.
+
 2008-11-18  Christopher Blizzard  <blizzard@0xdeadbeef.com>
 
        * test-ws.cfg: Enable base_url_filter.base_url to localhost:9090
index 872d0e5..122b77b 100644 (file)
@@ -24,7 +24,7 @@ from twisted.internet import defer
 
 from services.command.base import CommandManager
 
-from services.command.download import DownloadCommand
+from services.command.download import DownloadCommand, DownloadResourceSaveState
 from services.command.newsite import NewSiteSetup, NewSiteTryURL, \
     NewSiteDone, NewSiteError, NewSiteCreate, NewSiteAudit
 from services.command.feedparse import FeedParseCommand, \
@@ -73,6 +73,7 @@ class RefreshManager(ProtoManager):
         CommandManager.__init__(self)
         self.commands = [ FeedRefreshSetup(dcm),          # get the feed url from the database
                           DownloadCommand(),              # download the feed
+                          DownloadResourceSaveState(),    # save the etag and last-modified from the download
                           FeedParseCommand(sm),           # send it out to the parser
                           FeedUpdateDatabaseCommand(dcm), # update the database
                           RefreshSiteDone(dcm)            # update our status
index 15f44e0..99c0414 100644 (file)
@@ -52,6 +52,26 @@ def confirm_twitter_info():
     except:
         pass
 
+class DownloadResourceSaveState(BaseCommand):
+    """
+    This takes the etag and last-modified data from a download and
+    saves it into the state.
+    """
+    def __init__(self):
+        BaseCommand.__init__(self)
+        self.name = "download"
+
+    def doCommand(self, state, args):
+        d = defer.Deferred()
+
+        state["download_etag"] = args.get("etag", None)
+        state["download_last_modified"] = args.get("last_modified", None)
+        state["download_url_stack"] = args.get("url_stack", None)
+
+        d.callback(dict(filename=args["filename"]))
+
+        return d
+
 class DownloadCommand(BaseCommand):
     """
     DownloadCommand takes a URL as an argument and will return a
@@ -63,15 +83,38 @@ class DownloadCommand(BaseCommand):
         self.d = defer.Deferred()
         self.url_stack = []
 
-    def doCommand(self, state, url=None,
-                  etag=None, last_modified=None, entity_url=None,
-                  *args, **kw):
+    def doCommand(self, state, arg, *args, **kw):
         """
-        Pass in a url.  This will return a deferred that will
-        eventually call back to you with a (result, filename) pair to
-        where the url has been downloaded.  It is your responsibility
-        to delete the file once you have finished with it.
+        Pass in a dict that includes url and optional etag +
+        last_modified information.  This will return a deferred that
+        will eventually call back to you with a (result,
+        dict(filename, etag, last_modified, url_stack)) pair to where
+        the url has been downloaded.  It is your responsibility to
+        delete the file once you have finished with it.
         """
+
+        url = None
+        etag = None
+        last_modified = None
+        entity_url = None
+
+        if arg:
+            url = arg.get("url", None)
+            etag = arg.get("etag", None)
+            last_modified = arg.get("last_modified", None)
+            entity_url = arg.get("entity_url", None)
+
+        # have to clean these up because the http code in twisted
+        # loses its crap if we pass down unicode data
+        if url:
+            url = str(url)
+        if etag:
+            etag = str(etag)
+        if last_modified:
+            last_modified = str(last_modified)
+        if entity_url:
+            entity_url = str(entity_url)
+
         self.state = state
         self.etag = etag
         self.last_modified = last_modified
@@ -136,10 +179,14 @@ class DownloadCommand(BaseCommand):
         print("  etag %s" % str(etag))
         print("  last-modified %s" % str(last_modified))
         print("  url stack %s" % str(self.url_stack))
-        self.state["download_etag"] = result["etag"]
-        self.state["download_last_modified"] = result["last_modified"]
-        self.state["download_url_stack"] = self.url_stack
-        self.d.callback(filename)
+
+        retval = dict()
+        retval["filename"] = filename
+        retval["etag"] = result["etag"]
+        retval["last_modified"] = result["last_modified"]
+        retval["url_stack"] = self.url_stack
+
+        self.d.callback(retval)
 
     def downloadError(self, failure):
         if failure.check(error.PageRedirect):
index 6d3881a..465a238 100644 (file)
@@ -68,7 +68,7 @@ class FeedRefreshSetup(BaseCommand):
 
         # Get the info for this feed out of the database
         q = """
-            SELECT feed, url FROM site WHERE id = %s
+            SELECT feed, url, etag, last_modified, entity_url FROM site WHERE id = %s
             """
 
         d = self.dcm.runQuery(q, site_id)
@@ -78,9 +78,18 @@ class FeedRefreshSetup(BaseCommand):
     def gotFeed(self, results):
         self.state["feed_url"] = results[0][0]
         self.state["url"] = results[0][1]
-        print("  feed url is %s" % self.state["feed_url"])
-        print("  base url is %s" % self.state["url"])
-        self.d.callback(self.state["feed_url"])
+        self.state["etag"] = results[0][2]
+        self.state["last_modified"] = results[0][3]
+        self.state["entity_url"] = results[0][4]
+        print("  feed url is      %s" % self.state["feed_url"])
+        print("  base url is      %s" % self.state["url"])
+        print("  etag is          %s" % self.state["etag"])
+        print("  last_modified is %s" % self.state["last_modified"])
+        print("  entity_url is    %s" % self.state["entity_url"])
+        self.d.callback(dict(url = self.state["feed_url"],
+                             etag = self.state["etag"],
+                             last_modified = self.state["last_modified"],
+                             entity_url = self.state["entity_url"]))
 
     def error(self, failure):
         print("got error: %s" % failure.getErrorMessage())
@@ -97,13 +106,17 @@ class FeedParseCommand(BaseCommand):
         ss = None
         self.name = "feedparse"
 
-    def doCommand(self, state, filename):
+    def doCommand(self, state, args):
         """
         This command will take the filename that's passed in from the
         RSS download and parse it.  It will return a filename that
         contains the parsed contents.  It will also check if there's a
         pre-parsed file and just return that immediately if it can.
         """
+        filename = None
+        if args:
+            filename = args["filename"]
+
         if state.has_key("try_url_parsed_feed_filename"):
             d = defer.Deferred()
             d.callback(state["try_url_parsed_feed_filename"])
index 21deacf..df63279 100644 (file)
@@ -254,7 +254,8 @@ class LinkedInScrapeCommand(BaseCommand):
         self.name = "linkedin-scrape"
         self.d = None
 
-    def doCommand(self, state, filename):
+    def doCommand(self, state, args):
+        filename = args["filename"]
         self.state = state
         self.entries = None
         self.found_user = False
index 8d73cc3..d8f5be4 100644 (file)
@@ -175,7 +175,7 @@ class NewSiteTryURL(BaseCommand):
 
         # Start off by trying to download the page
         download = DownloadCommand()
-        d = download.doCommand(self.state, str(self.state["try_url"]))
+        d = download.doCommand(self.state, dict(url=str(self.state["try_url"])))
         d.addCallback(self.downloadDone)
         d.addErrback(self.downloadError)
 
@@ -185,7 +185,8 @@ class NewSiteTryURL(BaseCommand):
 # Downloading the first URL
 ####
 
-    def downloadDone(self, filename):
+    def downloadDone(self, args):
+        filename = args["filename"]
         print("  download done filename: %s" % filename)
         self.state["try_url_filename"] = filename
         # assume it's HTML and try to scrape out any link header information
@@ -312,11 +313,12 @@ class NewSiteTryURL(BaseCommand):
 
     def startSecondDownload(self):
         download = DownloadCommand()
-        d = download.doCommand(self.state, str(self.state["feed_url"]))
+        d = download.doCommand(self.state, dict(url=str(self.state["feed_url"])))
         d.addCallback(self.secondDownloadDone)
         d.addErrback(self.secondDownloadFailed)
 
-    def secondDownloadDone(self, filename):
+    def secondDownloadDone(self, args):
+        filename = args["filename"]
         print("  second download done")
         self.state["try_url_filename"] = filename
         self.tryFeed()
@@ -332,7 +334,7 @@ class NewSiteTryURL(BaseCommand):
     def tryFeed(self):
         feed_parse = FeedParseCommand(self.sm)
         print("  XXX parsing feed, but need to fix relative URLs first here somehow")
-        d = feed_parse.doCommand(self.state, self.state["try_url_filename"])
+        d = feed_parse.doCommand(self.state, dict(filename=self.state["try_url_filename"]))
         d.addCallback(self.feedParseSucceeded)
         d.addErrback(self.feedParseFailed)
 
index dacc224..9a6c512 100644 (file)
@@ -24,6 +24,7 @@
 
 from twisted.internet import defer
 from services.command.base import BaseCommand
+from services.command.exceptions import NotModifiedError
 
 import datetime
 
@@ -52,15 +53,28 @@ class RefreshSiteDone(BaseCommand):
     def srDone(self, *args, **kw):
         id = self.state["site_id"]
         q = """
-            UPDATE site SET last_poll = %s where id = %s
+            UPDATE site SET last_poll = %s, etag = %s, last_modified = %s, entity_url = %s where id = %s
             """
-        d = self.dcm.runQuery(q, (datetime.datetime.utcnow(), id))
+
+        entity_url = None
+
+        try:
+            entity_url = self.state["download_url_stack"][-1]
+        except:
+            pass
+
+        d = self.dcm.runQuery(q, (datetime.datetime.utcnow(),
+                                  self.state.get("download_etag", None),
+                                  self.state.get("download_last_modified", None),
+                                  entity_url,
+                                  id))
         d.addCallback(self.done)
         d.addErrback(self.error)
 
     def done(self, *args, **kw):
         # return any new site_history_new_ids if we have them
-        retval = dict(site_history_new_ids = self.state.get("site_history_new_ids", []))
+        retval = dict(site_history_new_ids = self.state.get("site_history_new_ids", []),
+                      http_entity_hit=0)
         
         self.d.callback(retval)
 
@@ -77,17 +91,33 @@ class RefreshSiteError:
         id = state["site_refresh_id"]
         self.orig_failure = failure
 
-        # XXX add an error here at some point
+        self.it_failed = True
 
-        q = """
-            UPDATE site_refresh SET status = "error", error = %s where id = %s
-            """
+        if failure.check(NotModifiedError):
+            q = """
+                UPDATE site_refresh set status = "done", error = %s where id = %s
+                """
+            error = "http_not_modified"
+            self.it_failed = False
 
-        d = self.dcm.runQuery(q, (None, id))
+        else:
+
+            q = """
+                UPDATE site_refresh SET status = "error", error = %s where id = %s
+                """
+            error = str(failure)
+        
+        d = self.dcm.runQuery(q, (error, id))
 
         d.addCallback(self.updateDone)
 
         return self.d
 
     def updateDone(self, *args, **kw):
-        self.d.errback(self.orig_failure)
+        if self.it_failed:
+            self.d.errback(self.orig_failure)
+            return
+
+        # this makes the error handler return success
+        retval = dict(site_history_new_ids = [], http_entity_hit=1)
+        self.d.callback(retval)
index 95f8b79..171e435 100644 (file)
@@ -55,7 +55,7 @@ class TestFeedParse(unittest.TestCase):
         self.fpc = FeedParseCommand(self.sm)
         self.state["url"] = "http://www.0xdeadbeef.com/weblog/"
         self.state["feed_url"] = "http://www.0xdeadbeef.com/weblog/feed=rss2"
-        d = self.fpc.doCommand(self.state, "../tests/twisted/local/data/beef-2.rss2")
+        d = self.fpc.doCommand(self.state, dict(filename="../tests/twisted/local/data/beef-2.rss2"))
         d.addCallback(self.feedParseParsed)
         return d
 
@@ -81,7 +81,7 @@ class TestFeedParse(unittest.TestCase):
 
         # now that we've loaded the database, see if we add an entry successfully
         self.fpc = FeedParseCommand(self.sm)
-        d = self.fpc.doCommand(self.state, "../tests/twisted/local/data/beef.rss2")
+        d = self.fpc.doCommand(self.state, dict(filename="../tests/twisted/local/data/beef.rss2"))
         d.addCallback(self.feedParseUpdated)
         return d
 
@@ -103,7 +103,7 @@ class TestFeedParse(unittest.TestCase):
         self.fpc = FeedParseCommand(self.sm)
         self.state["url"] = "http://www.0xdeadbeef.com/weblog/"
         self.state["feed_url"] = "http://www.0xdeadbeef.com/weblog/feed=rss2"
-        d = self.fpc.doCommand(self.state, "../tests/twisted/local/data/beef-no-ids-2.rss2")
+        d = self.fpc.doCommand(self.state, dict(filename="../tests/twisted/local/data/beef-no-ids-2.rss2"))
         d.addCallback(self.stupidFeedParseParsed)
         return d
 
@@ -133,7 +133,7 @@ class TestFeedParse(unittest.TestCase):
         # a new entry since none of the entries have ids associated
         # with them
         self.fpc = FeedParseCommand(self.sm)
-        d = self.fpc.doCommand(self.state, "../tests/twisted/local/data/beef-no-ids.rss2")
+        d = self.fpc.doCommand(self.state, dict(filename="../tests/twisted/local/data/beef-no-ids.rss2"))
         d.addCallback(self.stupidFeedParseUpdated)
         return d
 
index 340cb90..3d9bb0b 100644 (file)
@@ -30,7 +30,7 @@ class TestFeedParsePerf(unittest.TestCase):
 
     def doIteration(self):
         self.fpc = FeedParseCommand(self.sm)
-        d = self.fpc.doCommand(self.state, "../tests/twisted/local/data/beef-2.rss2")
+        d = self.fpc.doCommand(self.state, dict(filename="../tests/twisted/local/data/beef-2.rss2"))
         d.addCallback(self.parseDone)
 
     def parseDone(self, filename):
index 4280f5f..3b57b99 100644 (file)
@@ -113,8 +113,8 @@ class TestDownload(unittest.TestCase):
         state = dict()
         state["url"] = "http://localhost:9090/test/modified"
 
-        d = c.doCommand(state, etag="abc123",
-                        entity_url="http://localhost:9090/test/modified")
+        d = c.doCommand(state, dict(etag="abc123",
+                                    entity_url="http://localhost:9090/test/modified"))
 
         d.addCallback(lambda x: unittest.fail("should give a 304"))
         d.addErrback(lambda f: f.trap(NotModifiedError))
@@ -130,15 +130,17 @@ class TestDownload(unittest.TestCase):
         self.state = dict()
         self.state["url"] = "http://localhost:9090/test/modified"
 
-        d = c.doCommand(self.state, etag="abc123x",
-                        entity_url="http://localhost:9090/test/modified")
+        d = c.doCommand(self.state, dict(etag="abc123x",
+                                         entity_url="http://localhost:9090/test/modified"))
 
         d.addCallback(self._downloadDoneCheckETag)
 
         return d
 
-    def _downloadDoneCheckETag(self, filename):
-        etag = self.state["download_etag"]
+    def _downloadDoneCheckETag(self, args):
+        filename = args["filename"]
+        etag = args["etag"]
+
         assert(filename)
         print("filename %s" % filename)
         print("etag %s" % etag)
@@ -153,9 +155,8 @@ class TestDownload(unittest.TestCase):
         state = dict()
         state["url"] = "http://localhost:9090/test/modified"
 
-        d = c.doCommand(state,
-                        last_modified="Mon, 03 Nov 2008 01:27:18 GMT",
-                        entity_url="http://localhost:9090/test/modified")
+        d = c.doCommand(state, dict(last_modified="Mon, 03 Nov 2008 01:27:18 GMT",
+                                    entity_url="http://localhost:9090/test/modified"))
 
         d.addCallback(lambda x: unittest.fail("should give a 304"))
         d.addErrback(lambda f: f.trap(NotModifiedError))
@@ -171,16 +172,17 @@ class TestDownload(unittest.TestCase):
         self.state = dict()
         self.state["url"] = "http://localhost:9090/test/modified"
 
-        d = c.doCommand(self.state,
-                        last_modified="Mon, 03 Nov 2008 01:27:19 GMT",
-                        entity_url="http://localhost:9090/test/modified")
+        d = c.doCommand(self.state, dict(last_modified="Mon, 03 Nov 2008 01:27:19 GMT",
+                                         entity_url="http://localhost:9090/test/modified"))
 
         d.addCallback(self._downloadDoneCheckLastModified)
 
         return d
 
-    def _downloadDoneCheckLastModified(self, filename):
-        last_modified = self.state["download_last_modified"]
+    def _downloadDoneCheckLastModified(self, args):
+        filename = args["filename"]
+        last_modified = args["last_modified"]
+
         assert(filename)
         print("filename %s" % filename)
         print("last_modified %s" % last_modified)
@@ -195,7 +197,7 @@ class TestDownload(unittest.TestCase):
         self.state = dict()
         self.state["url"] = "http://localhost:9090/test/redirect1"
 
-        d = c.doCommand(self.state)
+        d = c.doCommand(self.state, None)
 
         d.addCallback(self._downloadDone)
 
@@ -210,7 +212,7 @@ class TestDownload(unittest.TestCase):
         self.state = dict()
         self.state["url"] = "http://localhost:9090/test/redirect0"
 
-        d = c.doCommand(self.state)
+        d = c.doCommand(self.state, None)
 
         d.addCallback(lambda x: unittest.fail("should get too many redirects"))
         d.addErrback(lambda f: f.trap(TooManyRedirectsError))
index 17f803c..b4e6f1d 100644 (file)
@@ -139,3 +139,72 @@ class TestFeedRefresh(unittest.TestCase):
         d.addErrback(self.confirmRefreshFailed)
 
         return d
+
+    # This tests refreshing a feed and getting an etag
+    def test_RefreshSiteManagerEntityProperties(self):
+        self.setupManagers()
+        model = self.m.model
+        p = model.Person(name="No One")
+        self.site = model.Site(person=p, url="http://localhost:9090/test/modified", type="feed",
+                               feed="http://localhost:9090/test/modified",
+                               feedType="rss2", title="something", created=datetime.utcnow(),
+                               lastUpdate = None,
+                               lastPoll = None, current = None,
+                               etag = None, last_modified = None, entity_url = None)
+
+        self.sr = model.SiteRefresh(site=self.site, status="new")
+
+        self.rm = RefreshManager(self.sm, self.dcm)
+        d = self.rm.doCommand(self.sr.id)
+
+        d.addCallback(self.confirmEntityProperties)
+
+        return d
+
+    def confirmEntityProperties(self, *args, **kw):
+        self.site.sync()
+        history = self.site.history
+
+        assert(history)
+        assert(len(history) == 1)
+
+        assert(self.site.etag == "abc123")
+        assert(self.site.last_modified == "Mon, 03 Nov 2008 01:27:18 GMT")
+        assert(self.site.entity_url == "http://localhost:9090/test/modified")
+
+
+    # This tests refreshing a feed and not updating from an etag/last-modified
+    def test_RefreshSiteManagerEntityHit(self):
+        self.setupManagers()
+        model = self.m.model
+        p = model.Person(name="No One")
+        self.site = model.Site(person=p, url="http://localhost:9090/test/modified", type="feed",
+                               feed="http://localhost:9090/test/modified",
+                               feedType="rss2", title="something", created=datetime.utcnow(),
+                               lastUpdate = None,
+                               lastPoll = None, current = None,
+                               etag = "abc123",
+                               last_modified = "Mon, 03 Nov 2008 01:27:18 GMT",
+                               entity_url = "http://localhost:9090/test/modified")
+
+        self.sr = model.SiteRefresh(site=self.site, status="new")
+
+        self.rm = RefreshManager(self.sm, self.dcm)
+        d = self.rm.doCommand(self.sr.id)
+
+        d.addCallback(self.confirmEntityHit)
+
+        return d
+
+    def confirmEntityHit(self, retval, **kw):
+        print retval
+        self.site.sync()
+        history = self.site.history
+
+        # Note that the feed here is actually one, but we test for
+        # zero because we never wanted to invoke the parser
+        assert(len(history) == 0)
+
+        # And in the error field we should get an error that indicates we hit the error path
+        self.sr.sync()
+        assert(self.sr.error == "http_not_modified")
index 49da041..86b40fd 100644 (file)
@@ -194,6 +194,12 @@ class Site(SQLObject):
     isRemoved = BoolCol(default=None)
     # time of removal
     removed = DateTimeCol(default=None)
+    # last etag
+    etag = UnicodeCol(default=None)
+    # last modified
+    last_modified = UnicodeCol(default=None)
+    # entity url for last modified / etag
+    entity_url = UnicodeCol(default=None)
     # site history
     history = MultipleJoin('SiteHistory')
     # the current entry - used by linkedin right now and anything else
index 74277b6..48e3378 100644 (file)
@@ -44,7 +44,7 @@ class TestController(controllers.Controller):
     def redirect4(self):
         raise redirect("/test/modified")
 
-    @expose()
+    @expose(content_type="application/rss+xml")
     def modified(self):
         if request.headers.get("if-none-match", None) == "abc123":
             response.status = 304
@@ -55,5 +55,31 @@ class TestController(controllers.Controller):
             response.status = 304
 
         response.headers["Last-Modified"] = "Mon, 03 Nov 2008 01:27:18 GMT"
-        
-        return dict()
+
+        data = """<?xml version="1.0"?>
+<rss version="2.0">
+   <channel>
+      <title>big fat title</title>
+      <link>http://localhost/rock-on</link>
+      <description>This is a description.</description>
+      <language>en-us</language>
+      <pubDate>Tue, 10 Jun 2003 04:00:00 GMT</pubDate>
+
+      <lastBuildDate>Tue, 10 Jun 2003 09:41:01 GMT</lastBuildDate>
+      <docs>http://blogs.law.harvard.edu/tech/rss</docs>
+      <generator>Some Crazy Generator</generator>
+      <managingEditor>editor@example.com</managingEditor>
+      <webMaster>webmaster@example.com</webMaster>
+      <item>
+         <title>This is an entry title</title>
+         <link>http://localhost/some-other-link</link>
+         <description>Blah blah blah blah.</description>
+         <pubDate>Tue, 20 May 2003 08:56:02 GMT</pubDate>
+         <guid>1234567890</guid>
+      </item>
+   </channel>
+</rss>
+"""
+        return data
+
+