added https to couchdb + cleanup + added couchdb to docs

This commit is contained in:
Keith Tunstead 2019-11-08 10:57:10 +01:00
parent afb0afa970
commit e69f5de584
8 changed files with 44 additions and 30 deletions

5
.env
View File

@ -1,5 +0,0 @@
COUCHDB_USER=user
COUCHDB_PASSWORD=password
COUCHDB_NAME=couchdb
COUCHDB_PORT_5984_TCP_ADDR=couchdb
COUCHDB_PORT_5984_TCP_PORT=5984

View File

@ -5,7 +5,7 @@ A Powerful Spider(Web Crawler) System in Python. **[TRY IT NOW!][Demo]**
- Write script in Python
- Powerful WebUI with script editor, task monitor, project manager and result viewer
- [MySQL](https://www.mysql.com/), [MongoDB](https://www.mongodb.org/), [Redis](http://redis.io/), [SQLite](https://www.sqlite.org/), [Elasticsearch](https://www.elastic.co/products/elasticsearch); [PostgreSQL](http://www.postgresql.org/) with [SQLAlchemy](http://www.sqlalchemy.org/) as database backend
- [MySQL](https://www.mysql.com/), [CouchDB](https://couchdb.apache.org), [MongoDB](https://www.mongodb.org/), [Redis](http://redis.io/), [SQLite](https://www.sqlite.org/), [Elasticsearch](https://www.elastic.co/products/elasticsearch); [PostgreSQL](http://www.postgresql.org/) with [SQLAlchemy](http://www.sqlalchemy.org/) as database backend
- [RabbitMQ](http://www.rabbitmq.com/), [Redis](http://redis.io/) and [Kombu](http://kombu.readthedocs.org/) as message queue
- Task priority, retry, periodical, recrawl by age, etc...
- Distributed architecture, Crawl Javascript pages, Python 2.{6,7}, 3.{3,4,5,6} support, etc...

View File

@ -1,6 +1,7 @@
version: "3.7"
# docker build ./ -t pyspider:latest
# replace /path/to/dir/ to point to config_example.json
services:
rabbitmq:
@ -16,8 +17,12 @@ services:
- pyspider
ports:
- "5984:5984"
env_file: .env
environment:
- COUCHDB_NAME=couchdb
- COUCHDB_USER=user
- COUCHDB_PASSWORD=password
- COUCHDB_HTTPS=true
# OR we can replace couchdb with mysql
#mysql:
# image: mysql:latest
# container_name: mysql
@ -27,15 +32,13 @@ services:
# - MYSQL_ALLOW_EMPTY_PASSWORD=yes
# networks:
# - pyspider
# env_file: .env
phantomjs:
image: pyspider:latest
container_name: phantomjs
networks:
- pyspider
env_file: .env
volumes:
- /Users/Keith/Documents/Projects/python/python_projects/pyspider/pyspider/config_example.json:/opt/pyspider/config.json
- /path/to/dir/config_example.json:/opt/pyspider/config.json
command: -c config.json phantomjs
depends_on:
- couchdb
@ -46,9 +49,8 @@ services:
container_name: result
networks:
- pyspider
env_file: .env
volumes:
- /Users/Keith/Documents/Projects/python/python_projects/pyspider/pyspider/config_example.json:/opt/pyspider/config.json
- /path/to/dir/config_example.json:/opt/pyspider/config.json
command: -c config.json result_worker
depends_on:
- couchdb
@ -59,9 +61,8 @@ services:
image: pyspider:latest
networks:
- pyspider
env_file: .env
volumes:
- /Users/Keith/Documents/Projects/python/python_projects/pyspider/pyspider/config_example.json:/opt/pyspider/config.json
- /path/to/dir/config_example.json:/opt/pyspider/config.json
command: -c config.json processor
depends_on:
- couchdb
@ -72,9 +73,8 @@ services:
container_name: fetcher
networks:
- pyspider
env_file: .env
volumes:
- /Users/Keith/Documents/Projects/python/python_projects/pyspider/pyspider/config_example.json:/opt/pyspider/config.json
- /path/to/dir/config_example.json:/opt/pyspider/config.json
command: -c config.json fetcher
depends_on:
- couchdb
@ -85,9 +85,8 @@ services:
container_name: scheduler
networks:
- pyspider
env_file: .env
volumes:
- /Users/Keith/Documents/Projects/python/python_projects/pyspider/pyspider/config_example.json:/opt/pyspider/config.json
- /path/to/dir/config_example.json:/opt/pyspider/config.json
command: -c config.json scheduler
depends_on:
- couchdb
@ -100,9 +99,8 @@ services:
- "5050:5000"
networks:
- pyspider
env_file: .env
volumes:
- /Users/Keith/Documents/Projects/python/python_projects/pyspider/pyspider/config_example.json:/opt/pyspider/config.json
- /path/to/dir/config_example.json:/opt/pyspider/config.json
environment:
- SCHEDULER_PORT_23333_TCP_ADDR=scheduler
command: -c config.json webui

View File

@ -72,6 +72,8 @@ sqlite:
mongodb:
mongodb+type://[username:password@]host1[:port1][,host2[:port2],...[,hostN[:portN]]][/[database][?options]]
more: http://docs.mongodb.org/manual/reference/connection-string/
couchdb:
couchdb+type://[username:password@]host[:port]
sqlalchemy:
sqlalchemy+postgresql+type://user:passwd@host:port/database
sqlalchemy+mysql+mysqlconnector+type://user:passwd@host:port/database

View File

@ -8,7 +8,7 @@ To deploy pyspider in product environment, running component in each process and
Installation
------------
To deploy pyspider components in each single processes, you need at least one database service. pyspider now supports [MySQL](http://www.mysql.com/), [MongoDB](http://www.mongodb.org/) and [PostgreSQL](http://www.postgresql.org/). You can choose one of them.
To deploy pyspider components in each single processes, you need at least one database service. pyspider now supports [MySQL](http://www.mysql.com/), [CouchDB](https://couchdb.apache.org), [MongoDB](http://www.mongodb.org/) and [PostgreSQL](http://www.postgresql.org/). You can choose one of them.
And you need a message queue service to connect the components together. You can use [RabbitMQ](http://www.rabbitmq.com/) or [Redis](http://redis.io/) as message queue.
@ -63,6 +63,8 @@ sqlite:
mongodb:
mongodb+type://[username:password@]host1[:port1][,host2[:port2],...[,hostN[:portN]]][/[database][?options]]
more: http://docs.mongodb.org/manual/reference/connection-string/
couchdb:
couchdb+type://[username:password@]host[:port]
sqlalchemy:
sqlalchemy+postgresql+type://user:passwd@host:port/database
sqlalchemy+mysql+mysqlconnector+type://user:passwd@host:port/database

View File

@ -5,7 +5,7 @@ A Powerful Spider(Web Crawler) System in Python. **[TRY IT NOW!][Demo]**
- Write script in Python
- Powerful WebUI with script editor, task monitor, project manager and result viewer
- [MySQL](https://www.mysql.com/), [MongoDB](https://www.mongodb.org/), [Redis](http://redis.io/), [SQLite](https://www.sqlite.org/), [Elasticsearch](https://www.elastic.co/products/elasticsearch); [PostgreSQL](http://www.postgresql.org/) with [SQLAlchemy](http://www.sqlalchemy.org/) as database backend
- [MySQL](https://www.mysql.com/), [CouchDB](https://couchdb.apache.org), [MongoDB](https://www.mongodb.org/), [Redis](http://redis.io/), [SQLite](https://www.sqlite.org/), [Elasticsearch](https://www.elastic.co/products/elasticsearch); [PostgreSQL](http://www.postgresql.org/) with [SQLAlchemy](http://www.sqlalchemy.org/) as database backend
- [RabbitMQ](http://www.rabbitmq.com/), [Redis](http://redis.io/) and [Kombu](http://kombu.readthedocs.org/) as message queue
- Task priority, retry, periodical, recrawl by age, etc...
- Distributed architecture, Crawl Javascript pages, Python 2&3, etc...

View File

@ -34,7 +34,7 @@ def connect_database(url):
elasticsearch:
elasticsearch+type://host:port/?index=pyspider
couchdb:
couchdb+type://host[:port]
couchdb+type://[username:password@]host[:port]
local:
local+projectdb://filepath,filepath
@ -207,14 +207,29 @@ def _connect_elasticsearch(parsed, dbtype):
def _connect_couchdb(parsed, dbtype, url):
# TODO: Add https + auth as parameters
url = "http://" + parsed.netloc + "/"
if os.environ.get('COUCHDB_HTTPS'):
url = "https://" + parsed.netloc + "/"
else:
url = "http://" + parsed.netloc + "/"
params = {}
params['username'] = os.environ.get('COUCHDB_USER') or 'user'
params['password'] = os.environ.get('COUCHDB_PASSWORD') or 'password'
username = None
password = None
if '@' in parsed.netloc:
# netloc looks like: 'user:pass@couchdb:999'
url = parsed.netloc[parsed.netloc.find("@")+1:]
# extract the username and password
username = parsed.netloc[:parsed.netloc.find(":")]
password = parsed.netloc[parsed.netloc.find(":")+1:parsed.netloc.find("@")]
# default to env, then url, then hard coded
params['username'] = os.environ.get('COUCHDB_USER') or username or 'user'
params['password'] = os.environ.get('COUCHDB_PASSWORD') or password or 'password'
# create required CouchDB databases if not already present
requests.put(url+"_users")
requests.put(url+"_replicator")
# create the admin user
# NOTE: Over docker, this user is already created when COUCHDB_USER and COUCHDB_PASSWORD are set
requests.put(url+'_node/_local/_config/admins/'+ params['username'],

View File

@ -114,8 +114,10 @@ def cli(ctx, **kwargs):
elif os.environ.get('COUCHDB_NAME'):
kwargs[db] = utils.Get(lambda db=db: connect_database(
'couchdb+%s://%s:%s/%s' % (
db, os.environ['COUCHDB_PORT_5984_TCP_ADDR'],
os.environ['COUCHDB_PORT_5984_TCP_PORT'], db)))
db,
os.environ['COUCHDB_PORT_5984_TCP_ADDR'] or 'couchdb',
os.environ['COUCHDB_PORT_5984_TCP_PORT'] or '5984',
db)))
elif ctx.invoked_subcommand == 'bench':
if kwargs['data_path'] == './data':
kwargs['data_path'] += '/bench'