* whoisi/test_controller.py (TestController.modified): Now returns
a valid RSS feed and content type for tests.
* whoisi/model.py (Site): Add etag, last_modified and entity_url
entries to the site table.
* tests/twisted/network/test_download.py: Lots of changes here to
support the new download command format (using a hash instead of
a direct call).
* tests/twisted/local/test_feedparse_perf.py: Same.
* tests/twisted/local/test_feedparse.py: Same.
* tests/twisted/network/test_feedrefresh.py
(TestFeedRefresh.test_RefreshSiteManagerEntityProperties): This
test makes sure that we set entity properties in the site table
after we hit a site that includes them.
(TestFeedRefresh.test_RefreshSiteManagerEntityHit): This test
makes sure that we return early and don't parse when we send a
matching etag or last-modified along with a request.
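These tests exercise plain HTTP conditional GET. As a rough sketch
of the handshake the test controller implements (illustrative code,
not from the tree):

    import httplib

    conn = httplib.HTTPConnection("localhost", 9090)
    conn.request("GET", "/test/modified",
                 headers={"If-None-Match": "abc123"})
    resp = conn.getresponse()
    if resp.status == 304:
        # entity hit: the body is unchanged, skip the download and parse
        pass
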
* services/command/siterefresh.py (RefreshSiteDone.srDone): Save
etag, last-modified and entity_url info in the site if we have it.
(RefreshSiteDone.done): When returning the data to the master
process, add http_entity_hit=0 to the dict so we know we did a
download. (For future use.)
(RefreshSiteError.handleError): Handle the DownloadCommand
throwing a NotModifiedError, which means we don't have to do any
parsing or updating of information, so we short-circuit and exit.
The return value will include http_entity_hit=1 for future use.
We also set the error field to http_not_modified when we hit this
condition, and update the error field in the SiteRefresh table
when there's a real error.
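Concretely, a refresh now resolves to one of two dict shapes (the
id values here are illustrative):

    # a real download and parse happened:
    dict(site_history_new_ids=[42, 43], http_entity_hit=0)

    # the server answered 304, so nothing was parsed:
    dict(site_history_new_ids=[], http_entity_hit=1)
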
* services/command/controller.py (RefreshManager.__init__): Use
new DownloadResourceSaveState after a download as part of a
refresh.
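(With the shim in place, the full refresh pipeline reads:
FeedRefreshSetup, DownloadCommand, DownloadResourceSaveState,
FeedParseCommand, FeedUpdateDatabaseCommand, RefreshSiteDone; see
the controller.py hunk below.)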
* services/command/newsite.py (NewSiteTryURL.doCommand): When
calling the download command pass in the url as part of a
dictionary.
(NewSiteTryURL.downloadDone): More args["filename"] changes.
(NewSiteTryURL.startSecondDownload): Same.
(NewSiteTryURL.secondDownloadDone): Same.
(NewSiteTryURL.tryFeed): Same.
* services/command/download.py (DownloadResourceSaveState): Shim
command that takes the download data and saves it into the state
for later commands.
(DownloadCommand.doCommand): New code to handle etag,
last_modified and entity_url info as arguments to this command.
(DownloadCommand.downloadDone): Data is now returned as a hash
that includes filename, etag, last_modified and the url stack of
downloads.
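Put together, the new calling convention looks roughly like this
(a sketch; the url and header values are made up, and errbacks for
real failures are elided):

    download = DownloadCommand()
    d = download.doCommand(state, dict(url="http://example.com/feed",
                                       etag="abc123",
                                       last_modified="Mon, 03 Nov 2008 01:27:18 GMT"))

    def downloadDone(result):
        # result is a dict now, not a bare filename
        print("saved %s (etag %s)" % (result["filename"], result["etag"]))

    d.addCallback(downloadDone)
    d.addErrback(lambda f: f.trap(NotModifiedError))  # a 304 arrives as an errback
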
* services/command/feedparse.py (FeedRefreshSetup.gotNewSite):
Gets the etag, last_modified and entity_url out of the database
when setting up for a feed refresh.
(FeedRefreshSetup.gotFeed): When a refresh has been set up, the
next command is the download, so set up everything the download
needs to send an etag + last-modified header if we can.
(FeedParseCommand.doCommand): Convert to use args["filename"]
instead of a bare filename, since the DownloadCommand now returns
more than just the filename.
* services/command/linkedin.py (LinkedInScrapeCommand.doCommand):
Convert the LinkedIn code to use args["filename"] instead of a
bare filename.
git-svn-id: svn://trac.whoisi.com/whoisi/trunk@13 ae879524-a8bd-4c4c-a5ea-74d2e5fc5a2c
+2008-11-22 Christopher Blizzard <blizzard@0xdeadbeef.com>
+
+	* whoisi/test_controller.py (TestController.modified): Now returns
+	a valid RSS feed and content type for tests.
+
+ * whoisi/model.py (Site): Add etag, last_modified and entity_url
+ entries to the site table.
+
+	* tests/twisted/network/test_download.py: Lots of changes here to
+	support the new download command format (using a hash instead of
+	a direct call).
+
+ * tests/twisted/local/test_feedparse_perf.py: Same.
+
+ * tests/twisted/local/test_feedparse.py: Same.
+
+ * tests/twisted/network/test_feedrefresh.py
+ (TestFeedRefresh.test_RefreshSiteManagerEntityProperties): This
+ test makes sure that we set entity properties in the site table
+ after we hit a site that includes them.
+ (TestFeedRefresh.test_RefreshSiteManagerEntityHit): This test
+ makes sure that we return early and don't parse when we send a
+ matching etag or last-modified along with a request.
+
+ * services/command/siterefresh.py (RefreshSiteDone.srDone): Save
+ etag, last-modified and entity_url info in the site if we have it.
+	(RefreshSiteDone.done): When returning the data to the master
+	process, add http_entity_hit=0 to the dict so we know we did a
+	download. (For future use.)
+	(RefreshSiteError.handleError): Handle the DownloadCommand
+	throwing a NotModifiedError, which means we don't have to do any
+	parsing or updating of information, so we short-circuit and exit.
+	The return value will include http_entity_hit=1 for future use.
+	We also set the error field to http_not_modified when we hit this
+	condition, and update the error field in the SiteRefresh table
+	when there's a real error.
+
+ * services/command/controller.py (RefreshManager.__init__): Use
+ new DownloadResourceSaveState after a download as part of a
+ refresh.
+
+ * services/command/newsite.py (NewSiteTryURL.doCommand): When
+ calling the download command pass in the url as part of a
+ dictionary.
+ (NewSiteTryURL.downloadDone): More args["filename"] changes.
+ (NewSiteTryURL.startSecondDownload): Same.
+ (NewSiteTryURL.secondDownloadDone): Same.
+ (NewSiteTryURL.tryFeed): Same.
+
+ * services/command/download.py (DownloadResourceSaveState): Shim
+ command that takes the download data and saves it into the state
+ for later commands.
+ (DownloadCommand.doCommand): New code to handle etag,
+ last_modified and entity_url info as arguments to this command.
+ (DownloadCommand.downloadDone): Data is now returned as a hash
+ that includes filename, etag, last_modified and the url stack of
+ downloads.
+
+ * services/command/feedparse.py (FeedRefreshSetup.gotNewSite):
+ Gets the etag, last_modified and entity_url out of the database
+ when setting up for a feed refresh.
+	(FeedRefreshSetup.gotFeed): When a refresh has been set up, the
+	next command is the download, so set up everything the download
+	needs to send an etag + last-modified header if we can.
+	(FeedParseCommand.doCommand): Convert to use args["filename"]
+	instead of a bare filename, since the DownloadCommand now returns
+	more than just the filename.
+
+	* services/command/linkedin.py (LinkedInScrapeCommand.doCommand):
+	Convert the LinkedIn code to use args["filename"] instead of a
+	bare filename.
+
2008-11-18 Christopher Blizzard <blizzard@0xdeadbeef.com>
* test-ws.cfg: Enable base_url_filter.base_url to localhost:9090
from services.command.base import CommandManager
-from services.command.download import DownloadCommand
+from services.command.download import DownloadCommand, DownloadResourceSaveState
from services.command.newsite import NewSiteSetup, NewSiteTryURL, \
NewSiteDone, NewSiteError, NewSiteCreate, NewSiteAudit
from services.command.feedparse import FeedParseCommand, \
CommandManager.__init__(self)
self.commands = [ FeedRefreshSetup(dcm), # get the feed url from the database
DownloadCommand(), # download the feed
+ DownloadResourceSaveState(), # save the etag and last-modified from the download
FeedParseCommand(sm), # send it out to the parser
FeedUpdateDatabaseCommand(dcm), # update the database
RefreshSiteDone(dcm) # update our status
except:
pass
+class DownloadResourceSaveState(BaseCommand):
+ """
+ This takes the etag and last-modified data from a download and
+ saves it into the state.
+ """
+ def __init__(self):
+ BaseCommand.__init__(self)
+ self.name = "download"
+
+ def doCommand(self, state, args):
+ d = defer.Deferred()
+
+ state["download_etag"] = args.get("etag", None)
+ state["download_last_modified"] = args.get("last_modified", None)
+ state["download_url_stack"] = args.get("url_stack", None)
+
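+        # pass only the filename on to the next command; the etag and
+        # last-modified values now travel in the state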
+ d.callback(dict(filename=args["filename"]))
+
+ return d
+
class DownloadCommand(BaseCommand):
"""
DownloadCommand takes a URL as an argument and will return a
self.d = defer.Deferred()
self.url_stack = []
- def doCommand(self, state, url=None,
- etag=None, last_modified=None, entity_url=None,
- *args, **kw):
+ def doCommand(self, state, arg, *args, **kw):
"""
- Pass in a url. This will return a deferred that will
- eventually call back to you with a (result, filename) pair to
- where the url has been downloaded. It is your responsibility
- to delete the file once you have finished with it.
+ Pass in a dict that includes url and optional etag +
+ last_modified information. This will return a deferred that
+ will eventually call back to you with a (result,
+ dict(filename, etag, last_modified, url_stack)) pair to where
+ the url has been downloaded. It is your responsibility to
+ delete the file once you have finished with it.
"""
+
+ url = None
+ etag = None
+ last_modified = None
+ entity_url = None
+
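+        # arg can be None (the redirect tests pass None and put the url
+        # in the state instead)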
+ if arg:
+ url = arg.get("url", None)
+ etag = arg.get("etag", None)
+ last_modified = arg.get("last_modified", None)
+ entity_url = arg.get("entity_url", None)
+
+ # have to clean these up because the http code in twisted
+ # loses its crap if we pass down unicode data
+ if url:
+ url = str(url)
+ if etag:
+ etag = str(etag)
+ if last_modified:
+ last_modified = str(last_modified)
+ if entity_url:
+ entity_url = str(entity_url)
+
self.state = state
self.etag = etag
self.last_modified = last_modified
print(" etag %s" % str(etag))
print(" last-modified %s" % str(last_modified))
print(" url stack %s" % str(self.url_stack))
- self.state["download_etag"] = result["etag"]
- self.state["download_last_modified"] = result["last_modified"]
- self.state["download_url_stack"] = self.url_stack
- self.d.callback(filename)
+
+ retval = dict()
+ retval["filename"] = filename
+ retval["etag"] = result["etag"]
+ retval["last_modified"] = result["last_modified"]
+ retval["url_stack"] = self.url_stack
+
+ self.d.callback(retval)
def downloadError(self, failure):
if failure.check(error.PageRedirect):
# Get the info for this feed out of the database
q = """
- SELECT feed, url FROM site WHERE id = %s
+ SELECT feed, url, etag, last_modified, entity_url FROM site WHERE id = %s
"""
d = self.dcm.runQuery(q, site_id)
def gotFeed(self, results):
self.state["feed_url"] = results[0][0]
self.state["url"] = results[0][1]
- print(" feed url is %s" % self.state["feed_url"])
- print(" base url is %s" % self.state["url"])
- self.d.callback(self.state["feed_url"])
+ self.state["etag"] = results[0][2]
+ self.state["last_modified"] = results[0][3]
+ self.state["entity_url"] = results[0][4]
+ print(" feed url is %s" % self.state["feed_url"])
+ print(" base url is %s" % self.state["url"])
+ print(" etag is %s" % self.state["etag"])
+ print(" last_modified is %s" % self.state["last_modified"])
+ print(" entity_url is %s" % self.state["entity_url"])
+ self.d.callback(dict(url = self.state["feed_url"],
+ etag = self.state["etag"],
+ last_modified = self.state["last_modified"],
+ entity_url = self.state["entity_url"]))
def error(self, failure):
print("got error: %s" % failure.getErrorMessage())
ss = None
self.name = "feedparse"
- def doCommand(self, state, filename):
+ def doCommand(self, state, args):
"""
This command will take the filename that's passed in from the
RSS download and parse it. It will return a filename that
contains the parsed contents. It will also check if there's a
pre-parsed file and just return that immediately if it can.
"""
+ filename = None
+ if args:
+ filename = args["filename"]
+
if state.has_key("try_url_parsed_feed_filename"):
d = defer.Deferred()
d.callback(state["try_url_parsed_feed_filename"])
self.name = "linkedin-scrape"
self.d = None
- def doCommand(self, state, filename):
+ def doCommand(self, state, args):
+ filename = args["filename"]
self.state = state
self.entries = None
self.found_user = False
# Start off by trying to download the page
download = DownloadCommand()
- d = download.doCommand(self.state, str(self.state["try_url"]))
+ d = download.doCommand(self.state, dict(url=str(self.state["try_url"])))
d.addCallback(self.downloadDone)
d.addErrback(self.downloadError)
# Downloading the first URL
####
- def downloadDone(self, filename):
+ def downloadDone(self, args):
+ filename = args["filename"]
print(" download done filename: %s" % filename)
self.state["try_url_filename"] = filename
# assume it's HTML and try to scrape out any link header information
def startSecondDownload(self):
download = DownloadCommand()
- d = download.doCommand(self.state, str(self.state["feed_url"]))
+ d = download.doCommand(self.state, dict(url=str(self.state["feed_url"])))
d.addCallback(self.secondDownloadDone)
d.addErrback(self.secondDownloadFailed)
- def secondDownloadDone(self, filename):
+ def secondDownloadDone(self, args):
+ filename = args["filename"]
print(" second download done")
self.state["try_url_filename"] = filename
self.tryFeed()
def tryFeed(self):
feed_parse = FeedParseCommand(self.sm)
print(" XXX parsing feed, but need to fix relative URLs first here somehow")
- d = feed_parse.doCommand(self.state, self.state["try_url_filename"])
+ d = feed_parse.doCommand(self.state, dict(filename=self.state["try_url_filename"]))
d.addCallback(self.feedParseSucceeded)
d.addErrback(self.feedParseFailed)
from twisted.internet import defer
from services.command.base import BaseCommand
+from services.command.exceptions import NotModifiedError
import datetime
def srDone(self, *args, **kw):
id = self.state["site_id"]
q = """
- UPDATE site SET last_poll = %s where id = %s
+ UPDATE site SET last_poll = %s, etag = %s, last_modified = %s, entity_url = %s where id = %s
"""
- d = self.dcm.runQuery(q, (datetime.datetime.utcnow(), id))
+
+ entity_url = None
+
+        try:
+            # the entity url is the final stop in the download's redirect chain
+            entity_url = self.state["download_url_stack"][-1]
+        except (KeyError, IndexError, TypeError):
+            pass
+
+ d = self.dcm.runQuery(q, (datetime.datetime.utcnow(),
+ self.state.get("download_etag", None),
+ self.state.get("download_last_modified", None),
+ entity_url,
+ id))
d.addCallback(self.done)
d.addErrback(self.error)
def done(self, *args, **kw):
# return any new site_history_new_ids if we have them
- retval = dict(site_history_new_ids = self.state.get("site_history_new_ids", []))
+ retval = dict(site_history_new_ids = self.state.get("site_history_new_ids", []),
+ http_entity_hit=0)
self.d.callback(retval)
id = state["site_refresh_id"]
self.orig_failure = failure
- # XXX add an error here at some point
+ self.it_failed = True
- q = """
- UPDATE site_refresh SET status = "error", error = %s where id = %s
- """
+        if failure.check(NotModifiedError):
+            q = """
+            UPDATE site_refresh SET status = "done", error = %s where id = %s
+            """
+            error = "http_not_modified"
+            self.it_failed = False
-        d = self.dcm.runQuery(q, (None, id))
+        else:
+            q = """
+            UPDATE site_refresh SET status = "error", error = %s where id = %s
+            """
+            error = str(failure)
+
+        d = self.dcm.runQuery(q, (error, id))
d.addCallback(self.updateDone)
return self.d
def updateDone(self, *args, **kw):
- self.d.errback(self.orig_failure)
+ if self.it_failed:
+ self.d.errback(self.orig_failure)
+ return
+
+ # this makes the error handler return success
+ retval = dict(site_history_new_ids = [], http_entity_hit=1)
+ self.d.callback(retval)
self.fpc = FeedParseCommand(self.sm)
self.state["url"] = "http://www.0xdeadbeef.com/weblog/"
self.state["feed_url"] = "http://www.0xdeadbeef.com/weblog/feed=rss2"
- d = self.fpc.doCommand(self.state, "../tests/twisted/local/data/beef-2.rss2")
+ d = self.fpc.doCommand(self.state, dict(filename="../tests/twisted/local/data/beef-2.rss2"))
d.addCallback(self.feedParseParsed)
return d
# now that we've loaded the database, see if we add an entry successfully
self.fpc = FeedParseCommand(self.sm)
- d = self.fpc.doCommand(self.state, "../tests/twisted/local/data/beef.rss2")
+ d = self.fpc.doCommand(self.state, dict(filename="../tests/twisted/local/data/beef.rss2"))
d.addCallback(self.feedParseUpdated)
return d
self.fpc = FeedParseCommand(self.sm)
self.state["url"] = "http://www.0xdeadbeef.com/weblog/"
self.state["feed_url"] = "http://www.0xdeadbeef.com/weblog/feed=rss2"
- d = self.fpc.doCommand(self.state, "../tests/twisted/local/data/beef-no-ids-2.rss2")
+ d = self.fpc.doCommand(self.state, dict(filename="../tests/twisted/local/data/beef-no-ids-2.rss2"))
d.addCallback(self.stupidFeedParseParsed)
return d
# a new entry since none of the entries have ids associated
# with them
self.fpc = FeedParseCommand(self.sm)
- d = self.fpc.doCommand(self.state, "../tests/twisted/local/data/beef-no-ids.rss2")
+ d = self.fpc.doCommand(self.state, dict(filename="../tests/twisted/local/data/beef-no-ids.rss2"))
d.addCallback(self.stupidFeedParseUpdated)
return d
def doIteration(self):
self.fpc = FeedParseCommand(self.sm)
- d = self.fpc.doCommand(self.state, "../tests/twisted/local/data/beef-2.rss2")
+ d = self.fpc.doCommand(self.state, dict(filename="../tests/twisted/local/data/beef-2.rss2"))
d.addCallback(self.parseDone)
def parseDone(self, filename):
state = dict()
state["url"] = "http://localhost:9090/test/modified"
- d = c.doCommand(state, etag="abc123",
- entity_url="http://localhost:9090/test/modified")
+ d = c.doCommand(state, dict(etag="abc123",
+ entity_url="http://localhost:9090/test/modified"))
d.addCallback(lambda x: unittest.fail("should give a 304"))
d.addErrback(lambda f: f.trap(NotModifiedError))
self.state = dict()
self.state["url"] = "http://localhost:9090/test/modified"
- d = c.doCommand(self.state, etag="abc123x",
- entity_url="http://localhost:9090/test/modified")
+ d = c.doCommand(self.state, dict(etag="abc123x",
+ entity_url="http://localhost:9090/test/modified"))
d.addCallback(self._downloadDoneCheckETag)
return d
- def _downloadDoneCheckETag(self, filename):
- etag = self.state["download_etag"]
+ def _downloadDoneCheckETag(self, args):
+ filename = args["filename"]
+ etag = args["etag"]
+
assert(filename)
print("filename %s" % filename)
print("etag %s" % etag)
state = dict()
state["url"] = "http://localhost:9090/test/modified"
- d = c.doCommand(state,
- last_modified="Mon, 03 Nov 2008 01:27:18 GMT",
- entity_url="http://localhost:9090/test/modified")
+ d = c.doCommand(state, dict(last_modified="Mon, 03 Nov 2008 01:27:18 GMT",
+ entity_url="http://localhost:9090/test/modified"))
d.addCallback(lambda x: unittest.fail("should give a 304"))
d.addErrback(lambda f: f.trap(NotModifiedError))
self.state = dict()
self.state["url"] = "http://localhost:9090/test/modified"
- d = c.doCommand(self.state,
- last_modified="Mon, 03 Nov 2008 01:27:19 GMT",
- entity_url="http://localhost:9090/test/modified")
+ d = c.doCommand(self.state, dict(last_modified="Mon, 03 Nov 2008 01:27:19 GMT",
+ entity_url="http://localhost:9090/test/modified"))
d.addCallback(self._downloadDoneCheckLastModified)
return d
- def _downloadDoneCheckLastModified(self, filename):
- last_modified = self.state["download_last_modified"]
+ def _downloadDoneCheckLastModified(self, args):
+ filename = args["filename"]
+ last_modified = args["last_modified"]
+
assert(filename)
print("filename %s" % filename)
print("last_modified %s" % last_modified)
self.state = dict()
self.state["url"] = "http://localhost:9090/test/redirect1"
- d = c.doCommand(self.state)
+ d = c.doCommand(self.state, None)
d.addCallback(self._downloadDone)
self.state = dict()
self.state["url"] = "http://localhost:9090/test/redirect0"
- d = c.doCommand(self.state)
+ d = c.doCommand(self.state, None)
d.addCallback(lambda x: unittest.fail("should get too many redirects"))
d.addErrback(lambda f: f.trap(TooManyRedirectsError))
d.addErrback(self.confirmRefreshFailed)
return d
+
+ # This tests refreshing a feed and getting an etag
+ def test_RefreshSiteManagerEntityProperties(self):
+ self.setupManagers()
+ model = self.m.model
+ p = model.Person(name="No One")
+ self.site = model.Site(person=p, url="http://localhost:9090/test/modified", type="feed",
+ feed="http://localhost:9090/test/modified",
+ feedType="rss2", title="something", created=datetime.utcnow(),
+ lastUpdate = None,
+ lastPoll = None, current = None,
+ etag = None, last_modified = None, entity_url = None)
+
+ self.sr = model.SiteRefresh(site=self.site, status="new")
+
+ self.rm = RefreshManager(self.sm, self.dcm)
+ d = self.rm.doCommand(self.sr.id)
+
+ d.addCallback(self.confirmEntityProperties)
+
+ return d
+
+ def confirmEntityProperties(self, *args, **kw):
+ self.site.sync()
+ history = self.site.history
+
+ assert(history)
+ assert(len(history) == 1)
+
+ assert(self.site.etag == "abc123")
+ assert(self.site.last_modified == "Mon, 03 Nov 2008 01:27:18 GMT")
+ assert(self.site.entity_url == "http://localhost:9090/test/modified")
+
+
+ # This tests refreshing a feed and not updating from an etag/last-modified
+ def test_RefreshSiteManagerEntityHit(self):
+ self.setupManagers()
+ model = self.m.model
+ p = model.Person(name="No One")
+ self.site = model.Site(person=p, url="http://localhost:9090/test/modified", type="feed",
+ feed="http://localhost:9090/test/modified",
+ feedType="rss2", title="something", created=datetime.utcnow(),
+ lastUpdate = None,
+ lastPoll = None, current = None,
+ etag = "abc123",
+ last_modified = "Mon, 03 Nov 2008 01:27:18 GMT",
+ entity_url = "http://localhost:9090/test/modified")
+
+ self.sr = model.SiteRefresh(site=self.site, status="new")
+
+ self.rm = RefreshManager(self.sm, self.dcm)
+ d = self.rm.doCommand(self.sr.id)
+
+ d.addCallback(self.confirmEntityHit)
+
+ return d
+
+ def confirmEntityHit(self, retval, **kw):
+        print(retval)
+ self.site.sync()
+ history = self.site.history
+
+        # Note that the feed actually contains one entry, but we expect
+        # zero history rows because the parser should never have run
+ assert(len(history) == 0)
+
+        # And the error field should record that we took the not-modified path
+ self.sr.sync()
+ assert(self.sr.error == "http_not_modified")
isRemoved = BoolCol(default=None)
# time of removal
removed = DateTimeCol(default=None)
+ # last etag
+ etag = UnicodeCol(default=None)
+ # last modified
+ last_modified = UnicodeCol(default=None)
+ # entity url for last modified / etag
+ entity_url = UnicodeCol(default=None)
# site history
history = MultipleJoin('SiteHistory')
# the current entry - used by linkedin right now and anything else
def redirect4(self):
raise redirect("/test/modified")
- @expose()
+ @expose(content_type="application/rss+xml")
def modified(self):
if request.headers.get("if-none-match", None) == "abc123":
response.status = 304
response.status = 304
response.headers["Last-Modified"] = "Mon, 03 Nov 2008 01:27:18 GMT"
-
- return dict()
+
+ data = """<?xml version="1.0"?>
+<rss version="2.0">
+ <channel>
+ <title>big fat title</title>
+ <link>http://localhost/rock-on</link>
+ <description>This is a description.</description>
+ <language>en-us</language>
+ <pubDate>Tue, 10 Jun 2003 04:00:00 GMT</pubDate>
+
+ <lastBuildDate>Tue, 10 Jun 2003 09:41:01 GMT</lastBuildDate>
+ <docs>http://blogs.law.harvard.edu/tech/rss</docs>
+ <generator>Some Crazy Generator</generator>
+ <managingEditor>editor@example.com</managingEditor>
+ <webMaster>webmaster@example.com</webMaster>
+ <item>
+ <title>This is an entry title</title>
+ <link>http://localhost/some-other-link</link>
+ <description>Blah blah blah blah.</description>
+ <pubDate>Tue, 20 May 2003 08:56:02 GMT</pubDate>
+ <guid>1234567890</guid>
+ </item>
+ </channel>
+</rss>
+"""
+ return data
+
+